diff --git a/.bazelrc b/.bazelrc index 843c0aac12b80e..a42ff862e855c9 100644 --- a/.bazelrc +++ b/.bazelrc @@ -302,9 +302,11 @@ common:cuda --@local_config_cuda//:enable_cuda common:cuda --config=cuda_version # This flag is needed to include CUDA libraries. common:cuda --@local_config_cuda//cuda:include_cuda_libs=true +common:cuda --@cuda_driver//:include_cuda_umd_libs=true # This configuration is used for building the wheels. common:cuda_wheel --@local_config_cuda//cuda:include_cuda_libs=false +common:cuda_wheel --@cuda_driver//:include_cuda_umd_libs=false # CUDA: This config refers to building CUDA op kernels with clang. common:cuda_clang --config=cuda @@ -596,7 +598,6 @@ common:use_tar_archive_files --repo_env=USE_LLVM_TAR_ARCHIVE_FILES=1 common:use_tar_archive_files --repo_env=USE_MIRRORED_TAR_ARCHIVE_FILES=1 # Make Bazel not try to probe the host system for a C++ toolchain. -common:rbe_base --config=use_tar_archive_files common:rbe_base --config=resultstore common:rbe_base --repo_env=BAZEL_DO_NOT_DETECT_CPP_TOOLCHAIN=1 common:rbe_base --define=EXECUTOR=remote @@ -639,8 +640,8 @@ common:rbe_linux_cpu --remote_instance_name=projects/tensorflow-testing/instance # Download CUDA/CUDNN redistributions to preserve the repositories cache between # CPU and GPU builds. # TODO(ybaturina): Uncomment when RBE is ready to support this. -commonld:rbe_linux_cpu --repo_env USE_CUDA_REDISTRIBUTIONS=1 -commonld:rbe_linux_cpu --config=cuda_version +common:rbe_linux_cpu --repo_env USE_CUDA_REDISTRIBUTIONS=1 +common:rbe_linux_cpu --config=cuda_version # Deprecated RBE config with non-hermetic toolchains. 
common:rbe_linux_cpu_clang_local --config=rbe_linux_cpu @@ -666,9 +667,6 @@ common:rbe_linux_cuda --config=cuda_clang_official common:rbe_linux_cuda --config=rbe_linux_cpu # For Remote build execution -- GPU configuration common:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1 -# Enable forward compatibility for CUDA builds because RBE docker image doesn't -# have latest CUDA drivers installed. -common:rbe_linux_cuda --@cuda_driver//:enable_forward_compatibility=true common:rbe_linux_cuda_nvcc --config=rbe_linux_cuda common:rbe_linux_cuda_nvcc --config=cuda_nvcc @@ -861,7 +859,7 @@ test:linux_cpu_wheel_test --@local_xla//third_party/py:wheel_dependency=true --c test:linux_cuda_wheel_test_filters --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-tf_tosa,-oss_excluded,-oss_serial,-benchmark-test,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310,-no_oss_py313 test:linux_cuda_wheel_test_filters --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-tf_tosa,-oss_excluded,-oss_serial,-benchmark-test,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310,-no_oss_py313 test:linux_cuda_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium -test:linux_cuda_wheel_test --@local_xla//third_party/py:wheel_dependency=true --config=linux_cuda_wheel_test_filters -- //tensorflow/... //tensorflow/tools/pip_package:prebuilt_wheel_import_api_packages_test_gpu -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:linux_cuda_wheel_test --repo_env=HERMETIC_CUDA_UMD_VERSION=12.8.1 --@local_xla//third_party/py:wheel_dependency=true --config=linux_cuda_wheel_test_filters -- //tensorflow/... //tensorflow/tools/pip_package:prebuilt_wheel_import_api_packages_test_gpu -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... 
# ARM64 WHEEL test:linux_arm64_wheel_test_filters --test_tag_filters=-no_oss,-tf_tosa,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310,-no_oss_py313 test:linux_arm64_wheel_test_filters --build_tag_filters=-no_oss,-tf_tosa,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310,-no_oss_py313 diff --git a/.bazelversion b/.bazelversion index 5c733d6c13a497..26c75fe8ad4fc9 100644 --- a/.bazelversion +++ b/.bazelversion @@ -1,2 +1,2 @@ -7.4.1 +7.7.0 # NOTE: Update Bazel version in tensorflow/tools/ci_build/release/common.sh.oss \ No newline at end of file diff --git a/.github/workflows/osv-scanner-scheduled.yml b/.github/workflows/osv-scanner-scheduled.yml index c0682a4cac7035..07896a48470753 100644 --- a/.github/workflows/osv-scanner-scheduled.yml +++ b/.github/workflows/osv-scanner-scheduled.yml @@ -28,7 +28,7 @@ permissions: jobs: scan-scheduled: if: github.repository == 'tensorflow/tensorflow' - uses: "google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml@v2.2.3" + uses: "google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml@v2.2.4" with: scan-args: |- --lockfile=requirements.txt:./requirements_lock_3_9.txt diff --git a/.github/workflows/scorecards-analysis.yml b/.github/workflows/scorecards-analysis.yml index 75339c6b4f6bd7..e635c4cd8ccc88 100644 --- a/.github/workflows/scorecards-analysis.yml +++ b/.github/workflows/scorecards-analysis.yml @@ -55,7 +55,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. 
- name: "Upload artifact" - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: SARIF file path: results.sarif @@ -64,6 +64,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard (optional). # Commenting out will disable upload of results to your repo's Code Scanning dashboard - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@3599b3baa15b485a2e49ef411a7a4bb2452e7f93 # v3.29.5 + uses: github/codeql-action/upload-sarif@0499de31b99561a6d14a36a5f662c2a54f91beee # v3.29.5 with: sarif_file: results.sarif diff --git a/.github/workflows/stale-issues.yml b/.github/workflows/stale-issues.yml index d9408810eb32ac..53f272bd5b9d8a 100644 --- a/.github/workflows/stale-issues.yml +++ b/.github/workflows/stale-issues.yml @@ -31,7 +31,7 @@ jobs: pull-requests: write steps: - name: Awaiting response issues - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0 + uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0 with: #Comma separated list of labels that can be assigned to issues to exclude them from being marked as stale exempt-issue-labels: 'override-stale' @@ -59,7 +59,7 @@ jobs: close-pr-message: "This PR was closed because it has been inactive for 14 days since being marked as stale. Please reopen if you'd like to work on this further." 
repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Contribution issues - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0 + uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0 with: #Comma separated list of labels that can be assigned to issues to exclude them from being marked as stale exempt-issue-labels: 'override-stale' diff --git a/RELEASE.md b/RELEASE.md index 7ac60de2539cc0..6255a4a1d8679e 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -22,6 +22,10 @@ * `tf.lite` * Adds int8 and int16x8 support for SQRT operator. * Adds int16x8 support for EQUAL and NOT_EQUAL operators. + * Adds support for int2 type. + * Adds support for int2/int4 in tfl.cast . + * Adds support for SRQ int2 in tfl.fully_connected. + * Adds support for int4 in tfl.slice. ### Bug Fixes and Other Changes diff --git a/ci/official/containers/ml_build/Dockerfile b/ci/official/containers/ml_build/Dockerfile index d12c886cc6d57a..a4fb0cd9b1640a 100644 --- a/ci/official/containers/ml_build/Dockerfile +++ b/ci/official/containers/ml_build/Dockerfile @@ -12,14 +12,6 @@ COPY builder.packages.txt /builder.packages.txt RUN /setup.sources.sh && /setup.packages.sh /builder.packages.txt -# Install devtoolset-9 in /dt9 with glibc 2.17 and libstdc++ 4.8, for building -# manylinux2014-compatible packages. 
-COPY builder.devtoolset/fixlinks.sh /fixlinks.sh -COPY builder.devtoolset/rpm-patch.sh /rpm-patch.sh -COPY builder.devtoolset/build_devtoolset.sh /build_devtoolset.sh -COPY builder.devtoolset/glibc2.17-inline.patch /glibc2.17-inline.patch -RUN /build_devtoolset.sh devtoolset-9 /dt9 - # Setup Python COPY setup.python.sh /setup.python.sh COPY builder.requirements.txt /builder.requirements.txt @@ -56,9 +48,6 @@ RUN ln -sf /usr/bin/python3.12 /usr/bin/python3 RUN ln -sf /usr/bin/python3.12 /usr/bin/python RUN ln -sf /usr/lib/python3.12 /usr/lib/tf_python -# Make sure clang is on the path -RUN ln -s /usr/lib/llvm-18/bin/clang /usr/bin/clang - # Link the compat driver to the location if available. RUN if [ -e "/usr/local/cuda/compat/libcuda.so.1" ]; then ln -s /usr/local/cuda/compat/libcuda.so.1 /usr/lib/x86_64-linux-gnu/libcuda.so.1; fi diff --git a/ci/official/containers/ml_build/builder.packages.txt b/ci/official/containers/ml_build/builder.packages.txt index 8dbbf4196440da..cf914a0425ef11 100644 --- a/ci/official/containers/ml_build/builder.packages.txt +++ b/ci/official/containers/ml_build/builder.packages.txt @@ -1,28 +1,9 @@ -# Packages to be installed for the new Docker image. - -# Packages needed to build devtoolset -file -flex -g++ -make -patch -rpm2cpio -unar -wget -xz-utils -cpio - # Other build-related tools apt-transport-https autoconf automake build-essential ca-certificates -llvm-18 -clang-18 -clang-tidy-18 -lld-18 -clang-format-12 curl git parallel @@ -32,4 +13,6 @@ unzip zip openjdk-21-jdk vim +wget jq +file diff --git a/ci/official/containers/ml_build/builder.requirements.txt b/ci/official/containers/ml_build/builder.requirements.txt index 114efaf9dc9757..ae113c68c2f03c 100644 --- a/ci/official/containers/ml_build/builder.requirements.txt +++ b/ci/official/containers/ml_build/builder.requirements.txt @@ -5,6 +5,9 @@ id urllib3 requests +# For XLA +pyyaml + # For JAX build ~= 1.2.2 # uv is faster than pip for installing Python packages. 
diff --git a/ci/official/containers/ml_build/cuda13.0_cudnn9.15.packages.txt b/ci/official/containers/ml_build/cuda13.0_cudnn9.15.packages.txt new file mode 100644 index 00000000000000..dcc171ac5af019 --- /dev/null +++ b/ci/official/containers/ml_build/cuda13.0_cudnn9.15.packages.txt @@ -0,0 +1,23 @@ +# All required CUDA packages +cuda-compat-13-0 +cuda-command-line-tools-13-0 +cuda-cudart-dev-13-0 +cuda-nvcc-13-0 +cuda-cupti-13-0 +cuda-nvprune-13-0 +cuda-libraries-13-0 +cuda-libraries-dev-13-0 +cuda-nvml-dev-13-0 +libcufft-13-0 +libcurand-13-0 +libcusolver-dev-13-0 +libcusparse-dev-13-0 +libcublas-13-0 +libcublas-dev-13-0 +libnccl-dev=2.27.7-1+cuda13.0 +libnccl2=2.27.7-1+cuda13.0 +# CuDNN: https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#ubuntu-network-installation +libcudnn9-headers-cuda-13=9.15.1.9-1 +libcudnn9-static-cuda-13=9.15.1.9-1 +libcudnn9-dev-cuda-13=9.15.1.9-1 +libcudnn9-cuda-13=9.15.1.9-1 \ No newline at end of file diff --git a/ci/official/containers/ml_build/setup.python.sh b/ci/official/containers/ml_build/setup.python.sh index cd56f3ca552d0f..b849457420f522 100755 --- a/ci/official/containers/ml_build/setup.python.sh +++ b/ci/official/containers/ml_build/setup.python.sh @@ -45,16 +45,6 @@ fi /setup.packages.sh pythons.txt -# Re-link pyconfig.h from x86_64-linux-gnu into the devtoolset directory -# for any Python version present -pushd /usr/include/x86_64-linux-gnu -for f in $(ls | grep python); do - # set up symlink for devtoolset-9 - rm -f /dt9/usr/include/x86_64-linux-gnu/$f - ln -s /usr/include/x86_64-linux-gnu/$f /dt9/usr/include/x86_64-linux-gnu/$f -done -popd - # Python 3.10 include headers fix: # sysconfig.get_path('include') incorrectly points to /usr/local/include/python # map /usr/include/python3.10 to /usr/local/include/python3.10 diff --git a/ci/official/envs/linux_arm64 b/ci/official/envs/linux_arm64 index 52aa80518b4b9c..026cc1bee85bf7 100644 --- a/ci/official/envs/linux_arm64 +++ b/ci/official/envs/linux_arm64 @@ 
-28,5 +28,5 @@ TFCI_OUTPUT_DIR=build_output TFCI_WHL_AUDIT_ENABLE=1 TFCI_WHL_AUDIT_PLAT=manylinux2014_aarch64 TFCI_WHL_BAZEL_TEST_ENABLE=1 -TFCI_WHL_SIZE_LIMIT=265M +TFCI_WHL_SIZE_LIMIT=270M TFCI_WHL_SIZE_LIMIT_ENABLE=1 diff --git a/ci/official/envs/windows_x86_2022 b/ci/official/envs/windows_x86_2022 index 56187ad78eca17..3c57bcfb8114ee 100644 --- a/ci/official/envs/windows_x86_2022 +++ b/ci/official/envs/windows_x86_2022 @@ -15,7 +15,7 @@ TFCI_DOCKER_ENABLE=1 TFCI_DOCKER_PULL_ENABLE=1 TFCI_DOCKER_IMAGE="gcr.io/tensorflow-testing/tf-win2022@sha256:915cb093630432c38b028f56bd31116a5559ebbc688d427b6092d86828ae03bc" -TFCI_BAZEL_BAZELRC_ARGS="--output_user_root=C:/t" +TFCI_BAZEL_BAZELRC_ARGS="--output_user_root=C:/x" TFCI_BAZEL_COMMON_ARGS="--repo_env=HERMETIC_PYTHON_VERSION=$TFCI_PYTHON_VERSION --repo_env=USE_PYWRAP_RULES=True --config=windows_x86_cpu_2022" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=windows_x86_cpu_2022 TFCI_BUILD_PIP_PACKAGE_WHEEL_NAME_ARG="--repo_env=WHEEL_NAME=tensorflow" diff --git a/ci/official/requirements_updater/numpy1_requirements/requirements.in b/ci/official/requirements_updater/numpy1_requirements/requirements.in index c6a88054433ec0..a24dc1a57e3683 100644 --- a/ci/official/requirements_updater/numpy1_requirements/requirements.in +++ b/ci/official/requirements_updater/numpy1_requirements/requirements.in @@ -1,7 +1,7 @@ # Requirements for NumPy 1.x numpy ~= 1.26.0 wheel ~= 0.41.2 -h5py >= 3.11.0 +h5py >= 3.11.0, < 3.15.0 lit ~= 17.0.2 opt_einsum == 3.3.0 astunparse == 1.6.3 diff --git a/ci/official/requirements_updater/requirements.in b/ci/official/requirements_updater/requirements.in index 2a1fb43664c408..86d5526834753f 100644 --- a/ci/official/requirements_updater/requirements.in +++ b/ci/official/requirements_updater/requirements.in @@ -1,7 +1,7 @@ # Note that numpy 2.1.0 does not support python 3.9 numpy >= 2.0.0, < 2.2.0 wheel ~= 0.41.2 -h5py >= 3.11.0 +h5py >= 3.11.0, < 3.15.0 lit ~= 17.0.2 opt_einsum == 3.3.0 astunparse == 1.6.3 diff 
--git a/ci/official/utilities/setup_docker.sh b/ci/official/utilities/setup_docker.sh index d928272d5ae1a3..03f49d85797225 100755 --- a/ci/official/utilities/setup_docker.sh +++ b/ci/official/utilities/setup_docker.sh @@ -62,6 +62,12 @@ if ! docker container inspect tf >/dev/null 2>&1 ; then # Additional setup is contained in ci/official/envs/rbe. CONTAINER_IP_ADDR=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' tf) netsh advfirewall firewall add rule name="Allow Metadata Proxy" dir=in action=allow protocol=TCP localport=80 remoteip="$CONTAINER_IP_ADDR" + + # Stop non-essential indexing and link tracking services that + # may lock new files or symlinks. + # They may be causing sporadic "Permission denied" errors during Bazel builds. + # b/461500885 + docker exec tf powershell -NoProfile -Command 'Stop-Service -Name SysMain,DiagTrack -Force -ErrorAction SilentlyContinue' fi fi diff --git a/tensorflow/BUILD b/tensorflow/BUILD index f000821983b779..558b59368e615b 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -1033,6 +1033,7 @@ package_group( "//tensorflow_models/google/recml/...", "//third_party/cloud_tpu/convergence_tools/sdc_monitoring/...", "//third_party/cloud_tpu/inference_converter/...", + "//third_party/pathways/...", "//third_party/py/cloud_ml_autoflow/...", "//third_party/py/envlogger/...", "//third_party/py/gldm/...", @@ -1180,38 +1181,31 @@ tf_cc_shared_library( linkstatic = 1, per_os_targets = True, roots = [ - "//tensorflow/c/experimental/filesystem:filesystem_interface", - "//tensorflow/c/experimental/stream_executor:stream_executor", - "//tensorflow/c:env", - "//tensorflow/c:kernels", - "//tensorflow/c:kernels_experimental", - "//tensorflow/c:logging", - "//tensorflow/c:ops", - "//tensorflow/cc/saved_model:fingerprinting_impl", - "//tensorflow/cc/saved_model:loader_lite_impl", - "//tensorflow/cc/saved_model:metrics_impl", - "//tensorflow/compiler/tf2tensorrt:op_converter_registry_impl", - 
"//tensorflow/core/common_runtime:core_cpu_impl", - "//tensorflow/core/common_runtime/gpu:gpu_runtime_impl", - "//tensorflow/core/common_runtime/pluggable_device:pluggable_device_runtime_impl", - "//tensorflow/core:framework_internal_impl", - "//tensorflow/core/framework:tensor", - "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl", - "//tensorflow/core:lib_internal_impl", - "//tensorflow/core/profiler:profiler_impl", - "//tensorflow/core/util:determinism", # Must be linked and exported to libtensorflow_framework.so. - "//tensorflow/lite/kernels/shim:tf_kernel_shim", - "@local_xla//xla/stream_executor:stream_executor_impl", - "@local_xla//xla/tsl/framework:bfc_allocator", - "@local_xla//xla/tsl/framework:metrics", - ] + tf_additional_binary_deps() + - # TODO(b/259305727): Remove this select and include captured_function in macos builds. - select({ - "//tensorflow:macos": [], - "//conditions:default": [ - "//tensorflow/core/data:captured_function", - ], - }), + "//tensorflow/c/experimental/filesystem:filesystem_interface", + "//tensorflow/c/experimental/stream_executor:stream_executor", + "//tensorflow/c:env", + "//tensorflow/c:kernels", + "//tensorflow/c:kernels_experimental", + "//tensorflow/c:ops", + "//tensorflow/cc/saved_model:fingerprinting_impl", + "//tensorflow/cc/saved_model:loader_lite_impl", + "//tensorflow/cc/saved_model:metrics_impl", + "//tensorflow/compiler/tf2tensorrt:op_converter_registry_impl", + "//tensorflow/core/common_runtime:core_cpu_impl", + "//tensorflow/core/common_runtime/gpu:gpu_runtime_impl", + "//tensorflow/core/common_runtime/pluggable_device:pluggable_device_runtime_impl", + "//tensorflow/core:framework_internal_impl", + "//tensorflow/core/framework:tensor", + "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl", + "//tensorflow/core:lib_internal_impl", + "//tensorflow/core/profiler:profiler_impl", + "//tensorflow/core/util:determinism", # Must be linked and exported to 
libtensorflow_framework.so. + "//tensorflow/lite/kernels/shim:tf_kernel_shim", + "@local_xla//xla/stream_executor:stream_executor_impl", + "@local_xla//xla/tsl/framework:bfc_allocator", + "@local_xla//xla/tsl/framework:metrics", + "//tensorflow/core/data:captured_function", + ] + tf_additional_binary_deps(), soversion = VERSION, static_deps = PACKAGE_STATIC_DEPS, visibility = ["//visibility:public"], diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 726433bafded24..3f4ec98028e8c3 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -298,7 +298,6 @@ tf_cuda_library( ], "//conditions:default": [ ":env", - ":logging", ":tf_status", ":tf_tensor", "//tensorflow/c/experimental/filesystem:modular_filesystem", @@ -325,18 +324,6 @@ tf_cuda_library( alwayslink = 1, ) -cc_library( - name = "logging", - srcs = ["logging.cc"], - hdrs = ["logging.h"], - visibility = ["//visibility:public"], - deps = [ - ":c_api_macros", - "//tensorflow/core/platform:logging", - "//tensorflow/core/platform:stringprintf", - ], -) - tf_cuda_library( name = "tf_status_internal", hdrs = [ diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc index b919be52b0bf68..4dd78e4cd7bbb1 100644 --- a/tensorflow/c/c_api_function_test.cc +++ b/tensorflow/c/c_api_function_test.cc @@ -1171,7 +1171,7 @@ TEST_F(CApiFunctionTest, InvalidOutputTensor_BadNodePtr) { EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); EXPECT_EQ(string("Node is null\n\tEncountered while processing output 0 " "from function 'MyFunc'"), - string(TF_Message(s_))); + std::string(TF_Message(s_))); } TEST_F(CApiFunctionTest, NodeMissingInput) { diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index e3e7d812b15838..f59a73a0871945 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -2478,7 +2478,7 @@ TEST_F(CApiAttributesTest, Names) { TF_OperationGetAttrName(oper, 0, value.get(), s_); EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); - EXPECT_EQ("v", 
string(static_cast(value.get()), 1)); + EXPECT_EQ("v", std::string(static_cast(value.get()), 1)); } TEST_F(CApiAttributesTest, Errors) { diff --git a/tensorflow/c/checkpoint_reader.cc b/tensorflow/c/checkpoint_reader.cc index 97a5bbd4b6077a..9dae0d3afd46fe 100644 --- a/tensorflow/c/checkpoint_reader.cc +++ b/tensorflow/c/checkpoint_reader.cc @@ -119,8 +119,7 @@ CheckpointReader::BuildV2VarMaps() { BundleEntryProto entry; v2_reader_->Seek(kHeaderEntryKey); for (v2_reader_->Next(); v2_reader_->Valid(); v2_reader_->Next()) { - CHECK(entry.ParseFromArray(v2_reader_->value().data(), - v2_reader_->value().size())) + CHECK(entry.ParseFromString(v2_reader_->value())) << entry.InitializationErrorString(); for (int i = 0; i < entry.slices_size(); ++i) { const auto& slice_proto = entry.slices(i); @@ -140,8 +139,7 @@ CheckpointReader::BuildV2VarMaps() { v2_reader_->Seek(kHeaderEntryKey); for (v2_reader_->Next(); v2_reader_->Valid(); v2_reader_->Next()) { if (filtered_keys.count(string(v2_reader_->key())) > 0) continue; - CHECK(entry.ParseFromArray(v2_reader_->value().data(), - v2_reader_->value().size())) + CHECK(entry.ParseFromString(v2_reader_->value())) << entry.InitializationErrorString(); string key(v2_reader_->key()); (*var_to_shape_map)[key] = TensorShape(entry.shape()); diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index ccde2ba3d9b769..91f83b3f88967d 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -939,7 +939,8 @@ void TFE_ContextAddFunctionDef(TFE_Context* ctx, const char* serialized_function_def, size_t size, TF_Status* status) { tensorflow::FunctionDef function_def; - if (!function_def.ParseFromArray(serialized_function_def, size)) { + if (!function_def.ParseFromString( + absl::string_view(serialized_function_def, size))) { status->status = tensorflow::errors::InvalidArgument("Invalid FunctionDef proto"); return; diff --git a/tensorflow/c/eager/c_api_experimental_reader.cc 
b/tensorflow/c/eager/c_api_experimental_reader.cc index 0959580a10438b..e93469bd4c1cfd 100644 --- a/tensorflow/c/eager/c_api_experimental_reader.cc +++ b/tensorflow/c/eager/c_api_experimental_reader.cc @@ -1,6 +1,6 @@ /* Copyright 2023 The TensorFlow Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License");; +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at diff --git a/tensorflow/c/eager/c_api_experimental_reader.h b/tensorflow/c/eager/c_api_experimental_reader.h index 71c2e4650f0520..d8bc2f6c65716b 100644 --- a/tensorflow/c/eager/c_api_experimental_reader.h +++ b/tensorflow/c/eager/c_api_experimental_reader.h @@ -1,6 +1,6 @@ /* Copyright 2023 The TensorFlow Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License");; +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at diff --git a/tensorflow/c/eager/parallel_device/BUILD b/tensorflow/c/eager/parallel_device/BUILD index 0802cc46267f66..d96de81bfa4365 100644 --- a/tensorflow/c/eager/parallel_device/BUILD +++ b/tensorflow/c/eager/parallel_device/BUILD @@ -177,5 +177,6 @@ tf_cc_test( "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", "//tensorflow/core/platform:strcat", "@com_google_absl//absl/log", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/c/eager/parallel_device/parallel_device_remote_test.cc b/tensorflow/c/eager/parallel_device/parallel_device_remote_test.cc index a231fc74033fdd..fcdbd4ea9c2a2f 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_remote_test.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device_remote_test.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include #include "absl/log/log.h" +#include "absl/strings/str_cat.h" #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/parallel_device/parallel_device_lib.h" #include "tensorflow/c/eager/parallel_device/parallel_device_testlib.h" diff --git a/tensorflow/c/env.cc b/tensorflow/c/env.cc index 03dd862f95cb0f..7d25709df2dfc7 100644 --- a/tensorflow/c/env.cc +++ b/tensorflow/c/env.cc @@ -34,7 +34,7 @@ limitations under the License. #include "tensorflow/core/platform/types.h" struct TF_StringStream { - std::vector<::tensorflow::string>* list; + std::vector* list; size_t position; }; @@ -134,7 +134,7 @@ void TF_StringStreamDone(TF_StringStream* list) { delete list; } TF_StringStream* TF_GetChildren(const char* dirname, TF_Status* status) { - auto* children = new std::vector<::tensorflow::string>; + auto* children = new std::vector; TF_SetStatus(status, TF_OK, ""); ::tensorflow::Set_TF_Status_from_Status( @@ -147,7 +147,7 @@ TF_StringStream* TF_GetChildren(const char* dirname, TF_Status* status) { } TF_StringStream* TF_GetLocalTempDirectories() { - auto* tmpdirs = new std::vector<::tensorflow::string>; + auto* tmpdirs = new std::vector; ::tensorflow::Env::Default()->GetLocalTempDirectories(tmpdirs); diff --git a/tensorflow/c/env_test.cc b/tensorflow/c/env_test.cc index d4c9bfce3c2127..3d338d4377366b 100644 --- a/tensorflow/c/env_test.cc +++ b/tensorflow/c/env_test.cc @@ -35,14 +35,12 @@ TEST(TestEnv, TestDirHandling) { TF_Status* s = TF_NewStatus(); - ::tensorflow::string dirpath = - ::tensorflow::io::JoinPath(tempdir, "somedir"); + std::string dirpath = ::tensorflow::io::JoinPath(tempdir, "somedir"); TF_CreateDir(dirpath.c_str(), s); ASSERT_TF_OK(s) << "TF_CreateDir failed for " << dirpath << ": " << TF_Message(s); - ::tensorflow::string filepath = - ::tensorflow::io::JoinPath(dirpath, "somefile.txt"); + std::string filepath = ::tensorflow::io::JoinPath(dirpath, "somefile.txt"); TF_WritableFileHandle* handle; TF_NewWritableFile(filepath.c_str(), &handle, 
s); ASSERT_TF_OK(s) << "NewWritableFile failed for " << filepath << ": " @@ -61,7 +59,7 @@ TEST(TestEnv, TestDirHandling) { ASSERT_TF_OK(s) << "TF_GetChildren failed for " << dirpath; const char* childpath; ASSERT_TRUE(TF_StringStreamNext(children, &childpath)); - ASSERT_EQ(::tensorflow::string(childpath), "somefile.txt"); + ASSERT_EQ(std::string(childpath), "somefile.txt"); // There should only be one file in this directory. ASSERT_FALSE(TF_StringStreamNext(children, &childpath)); ASSERT_EQ(childpath, nullptr); diff --git a/tensorflow/c/experimental/filesystem/BUILD b/tensorflow/c/experimental/filesystem/BUILD index 1f3f66b36681a0..ec446fd8389687 100644 --- a/tensorflow/c/experimental/filesystem/BUILD +++ b/tensorflow/c/experimental/filesystem/BUILD @@ -49,6 +49,7 @@ cc_library( "@com_google_absl//absl/log", "@com_google_absl//absl/log:check", "@com_google_absl//absl/status", + "@com_google_absl//absl/strings:string_view", "@local_xla//xla/tsl/platform:env", "@local_xla//xla/tsl/platform:errors", ], diff --git a/tensorflow/c/experimental/filesystem/modular_filesystem.h b/tensorflow/c/experimental/filesystem/modular_filesystem.h index b8482bbdb4f85d..5a8c4ba3ccb56c 100644 --- a/tensorflow/c/experimental/filesystem/modular_filesystem.h +++ b/tensorflow/c/experimental/filesystem/modular_filesystem.h @@ -24,6 +24,7 @@ limitations under the License. 
#include #include "absl/status/status.h" +#include "absl/strings/string_view.h" #include "tensorflow/c/experimental/filesystem/filesystem_interface.h" #include "xla/tsl/platform/file_system.h" #include "tensorflow/core/platform/file_statistics.h" diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD index 8fa3e726e6a837..f0f6e5351372e1 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD @@ -31,10 +31,10 @@ cc_library( ":gcs_helper", ":ram_file_block_cache", "//tensorflow/c:env", - "//tensorflow/c:logging", "//tensorflow/c:tf_status", "//tensorflow/c/experimental/filesystem:filesystem_interface", "@com_github_googlecloudplatform_google_cloud_cpp//:storage_client", + "@com_github_googlecloudplatform_google_cloud_cpp//google/cloud:google_cloud_cpp_common", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/log", "@com_google_absl//absl/strings", @@ -65,7 +65,6 @@ cc_library( deps = [ ":cleanup", "//tensorflow/c:env", - "//tensorflow/c:logging", "//tensorflow/c:tf_status", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/log", @@ -86,6 +85,7 @@ tf_cc_test( "//tensorflow/core:test_main", "//tensorflow/core/platform/cloud:now_seconds_env", "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", "@com_google_absl//absl/synchronization", "@com_google_absl//absl/time", "@local_xla//xla/tsl/protobuf:error_codes_proto_impl_cc", diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/expiring_lru_cache_test.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/expiring_lru_cache_test.cc index b0d283fff82d9b..e639f9a7dda476 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/expiring_lru_cache_test.cc +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/expiring_lru_cache_test.cc @@ -27,7 +27,7 @@ namespace tensorflow { namespace { TEST(ExpiringLRUCacheTest, MaxAge) 
{ - const string key = "a"; + const std::string key = "a"; std::unique_ptr env(new NowSecondsEnv); tf_gcs_filesystem::ExpiringLRUCache cache( 1, 0, [&env]() { return env->NowSeconds(); }); @@ -95,9 +95,10 @@ TEST(ExpiringLRUCacheTest, MaxEntries) { TEST(ExpiringLRUCacheTest, LookupOrCompute) { // max_age of 0 means we should always compute. - uint64 num_compute_calls = 0; + uint64_t num_compute_calls = 0; tf_gcs_filesystem::ExpiringLRUCache::ComputeFunc compute_func = - [&num_compute_calls](const string& key, int* value, TF_Status* status) { + [&num_compute_calls](const std::string& key, int* value, + TF_Status* status) { *value = num_compute_calls; num_compute_calls++; return TF_SetStatus(status, TF_OK, ""); diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc index 3b9650b7416315..f61208c7b4a174 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc @@ -40,7 +40,6 @@ limitations under the License. #include "google/cloud/storage/client.h" #include "tensorflow/c/env.h" #include "tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h" -#include "tensorflow/c/logging.h" #include "tensorflow/c/tf_status.h" // Implementation of a filesystem for GCS environments. diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h b/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h index 0060abc76699c3..3e972fa6292995 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h @@ -33,7 +33,6 @@ limitations under the License. 
#include "absl/synchronization/mutex.h" #include "absl/synchronization/notification.h" #include "tensorflow/c/env.h" -#include "tensorflow/c/logging.h" #include "tensorflow/c/tf_status.h" namespace tf_gcs_filesystem { diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache_test.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache_test.cc index 4ad4a8ea1868f3..23645ed8e878bf 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache_test.cc +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache_test.cc @@ -16,7 +16,6 @@ limitations under the License. #include "tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h" #include -#include #include #include #include @@ -25,6 +24,7 @@ limitations under the License. #include #include "absl/status/status.h" +#include "absl/strings/ascii.h" #include "absl/synchronization/blocking_counter.h" #include "absl/synchronization/notification.h" #include "absl/time/time.h" @@ -39,7 +39,7 @@ namespace tensorflow { namespace { absl::Status ReadCache(tf_gcs_filesystem::RamFileBlockCache* cache, - const string& filename, size_t offset, size_t n, + const std::string& filename, size_t offset, size_t n, std::vector* out) { out->clear(); out->resize(n, 0); @@ -54,7 +54,7 @@ absl::Status ReadCache(tf_gcs_filesystem::RamFileBlockCache* cache, } TEST(RamFileBlockCacheTest, IsCacheEnabled) { - auto fetcher = [](const string& filename, size_t offset, size_t n, + auto fetcher = [](const std::string& filename, size_t offset, size_t n, char* buffer, TF_Status* status) -> int64_t { // Do nothing. 
TF_SetStatus(status, TF_OK, ""); @@ -73,14 +73,14 @@ TEST(RamFileBlockCacheTest, IsCacheEnabled) { TEST(RamFileBlockCacheTest, ValidateAndUpdateFileSignature) { int calls = 0; - auto fetcher = [&calls](const string& filename, size_t offset, size_t n, + auto fetcher = [&calls](const std::string& filename, size_t offset, size_t n, char* buffer, TF_Status* status) -> int64_t { calls++; memset(buffer, 'x', n); TF_SetStatus(status, TF_OK, ""); return n; }; - string filename = "file"; + std::string filename = "file"; tf_gcs_filesystem::RamFileBlockCache cache(16, 32, 0, fetcher); std::vector out; @@ -101,12 +101,12 @@ TEST(RamFileBlockCacheTest, ValidateAndUpdateFileSignature) { } TEST(RamFileBlockCacheTest, PassThrough) { - const string want_filename = "foo/bar"; + const std::string want_filename = "foo/bar"; const size_t want_offset = 42; const size_t want_n = 1024; int calls = 0; auto fetcher = [&calls, want_filename, want_offset, want_n]( - const string& got_filename, size_t got_offset, + const std::string& got_filename, size_t got_offset, size_t got_n, char* buffer, TF_Status* status) -> int64_t { EXPECT_EQ(got_filename, want_filename); EXPECT_EQ(got_offset, want_offset); @@ -143,7 +143,7 @@ TEST(RamFileBlockCacheTest, BlockAlignment) { buf.push_back(i); } // The fetcher just fetches slices of the buffer. 
- auto fetcher = [&buf](const string& filename, size_t offset, size_t n, + auto fetcher = [&buf](const std::string& filename, size_t offset, size_t n, char* buffer, TF_Status* status) -> int64_t { int64_t bytes_transferred; if (offset < buf.size()) { @@ -191,8 +191,8 @@ TEST(RamFileBlockCacheTest, BlockAlignment) { TEST(RamFileBlockCacheTest, CacheHits) { const size_t block_size = 16; std::set calls; - auto fetcher = [&calls, block_size](const string& filename, size_t offset, - size_t n, char* buffer, + auto fetcher = [&calls, block_size](const std::string& filename, + size_t offset, size_t n, char* buffer, TF_Status* status) -> int64_t { EXPECT_EQ(n, block_size); EXPECT_EQ(offset % block_size, 0); @@ -202,7 +202,7 @@ TEST(RamFileBlockCacheTest, CacheHits) { TF_SetStatus(status, TF_OK, ""); return n; }; - const uint32 block_count = 256; + const uint32_t block_count = 256; tf_gcs_filesystem::RamFileBlockCache cache( block_size, block_count * block_size, 0, fetcher); std::vector out; @@ -225,7 +225,7 @@ TEST(RamFileBlockCacheTest, OutOfRange) { bool first_block = false; bool second_block = false; auto fetcher = [block_size, file_size, &first_block, &second_block]( - const string& filename, size_t offset, size_t n, + const std::string& filename, size_t offset, size_t n, char* buffer, TF_Status* status) -> int64_t { EXPECT_EQ(n, block_size); EXPECT_EQ(offset % block_size, 0); @@ -269,8 +269,9 @@ TEST(RamFileBlockCacheTest, Inconsistent) { // where we expected complete blocks. const size_t block_size = 16; // This fetcher returns OK but only fills in one byte for any offset. 
- auto fetcher = [block_size](const string& filename, size_t offset, size_t n, - char* buffer, TF_Status* status) -> int64_t { + auto fetcher = [block_size](const std::string& filename, size_t offset, + size_t n, char* buffer, + TF_Status* status) -> int64_t { EXPECT_EQ(n, block_size); EXPECT_EQ(offset % block_size, 0); EXPECT_GE(n, 1); @@ -293,8 +294,8 @@ TEST(RamFileBlockCacheTest, Inconsistent) { TEST(RamFileBlockCacheTest, LRU) { const size_t block_size = 16; std::list calls; - auto fetcher = [&calls, block_size](const string& filename, size_t offset, - size_t n, char* buffer, + auto fetcher = [&calls, block_size](const std::string& filename, + size_t offset, size_t n, char* buffer, TF_Status* status) -> int64_t { EXPECT_EQ(n, block_size); EXPECT_FALSE(calls.empty()) << "at offset = " << offset; @@ -306,7 +307,7 @@ TEST(RamFileBlockCacheTest, LRU) { TF_SetStatus(status, TF_OK, ""); return n; }; - const uint32 block_count = 2; + const uint32_t block_count = 2; tf_gcs_filesystem::RamFileBlockCache cache( block_size, block_count * block_size, 0, fetcher); std::vector out; @@ -342,7 +343,7 @@ TEST(RamFileBlockCacheTest, LRU) { TEST(RamFileBlockCacheTest, MaxStaleness) { int calls = 0; - auto fetcher = [&calls](const string& filename, size_t offset, size_t n, + auto fetcher = [&calls](const std::string& filename, size_t offset, size_t n, char* buffer, TF_Status* status) -> int64_t { calls++; memset(buffer, 'x', n); @@ -386,13 +387,13 @@ TEST(RamFileBlockCacheTest, MaxStaleness) { TEST(RamFileBlockCacheTest, RemoveFile) { int calls = 0; - auto fetcher = [&calls](const string& filename, size_t offset, size_t n, + auto fetcher = [&calls](const std::string& filename, size_t offset, size_t n, char* buffer, TF_Status* status) -> int64_t { calls++; char c = (filename == "a") ? 'a' : (filename == "b") ? 'b' : 'x'; if (offset > 0) { // The first block is lower case and all subsequent blocks are upper case. 
- c = toupper(c); + c = absl::ascii_toupper(c); } memset(buffer, c, n); TF_SetStatus(status, TF_OK, ""); @@ -448,7 +449,7 @@ TEST(RamFileBlockCacheTest, RemoveFile) { TEST(RamFileBlockCacheTest, Prune) { int calls = 0; - auto fetcher = [&calls](const string& filename, size_t offset, size_t n, + auto fetcher = [&calls](const std::string& filename, size_t offset, size_t n, char* buffer, TF_Status* status) -> int64_t { calls++; memset(buffer, 'x', n); @@ -458,7 +459,7 @@ TEST(RamFileBlockCacheTest, Prune) { std::vector out; // Our fake environment is initialized with the current timestamp. std::unique_ptr env(new NowSecondsEnv); - uint64 now = Env::Default()->NowSeconds(); + uint64_t now = Env::Default()->NowSeconds(); env->SetNowSeconds(now); tf_gcs_filesystem::RamFileBlockCache cache( 8, 32, 1 /* max staleness */, fetcher, @@ -487,7 +488,7 @@ TEST(RamFileBlockCacheTest, Prune) { // timestamp of `now` + 2, file "a" is stale because its first block is stale, // but file "b" is not stale yet. Thus, once the pruning thread wakes up (in // one second of wall time), it should remove "a" and leave "b" alone. 
- uint64 start = Env::Default()->NowSeconds(); + uint64_t start = Env::Default()->NowSeconds(); do { Env::Default()->SleepForMicroseconds(100000); } while (cache.CacheSize() == 24 && Env::Default()->NowSeconds() - start < 3); @@ -515,7 +516,7 @@ TEST(RamFileBlockCacheTest, ParallelReads) { absl::BlockingCounter counter(callers); absl::Notification notification; auto fetcher = [&counter, ¬ification]( - const string& filename, size_t offset, size_t n, + const std::string& filename, size_t offset, size_t n, char* buffer, TF_Status* status) -> int64_t { if (counter.DecrementCount()) { notification.Notify(); @@ -560,7 +561,7 @@ TEST(RamFileBlockCacheTest, CoalesceConcurrentReads) { int num_requests = 0; absl::Notification notification; auto fetcher = [&num_requests, ¬ification, block_size]( - const string& filename, size_t offset, size_t n, + const std::string& filename, size_t offset, size_t n, char* buffer, TF_Status* status) -> int64_t { EXPECT_EQ(n, block_size); EXPECT_EQ(offset, 0); @@ -591,7 +592,7 @@ TEST(RamFileBlockCacheTest, CoalesceConcurrentReads) { TEST(RamFileBlockCacheTest, Flush) { int calls = 0; - auto fetcher = [&calls](const string& filename, size_t offset, size_t n, + auto fetcher = [&calls](const std::string& filename, size_t offset, size_t n, char* buffer, TF_Status* status) -> int64_t { calls++; memset(buffer, 'x', n); diff --git a/tensorflow/c/experimental/gradients/tape/BUILD b/tensorflow/c/experimental/gradients/tape/BUILD index 20bc4a080f30ee..c0ae70b64abec7 100644 --- a/tensorflow/c/experimental/gradients/tape/BUILD +++ b/tensorflow/c/experimental/gradients/tape/BUILD @@ -50,6 +50,7 @@ cc_library( "//tensorflow/core/platform:strcat", "//tensorflow/core/platform:stringpiece", "@com_google_absl//absl/status", + "@com_google_absl//absl/strings:string_view", "@com_google_absl//absl/types:span", "@local_xla//xla/tsl/platform:errors", ], diff --git a/tensorflow/c/experimental/gradients/tape/tape_operation.cc 
b/tensorflow/c/experimental/gradients/tape/tape_operation.cc index 7cd3acffbc9cec..2839616c63991b 100644 --- a/tensorflow/c/experimental/gradients/tape/tape_operation.cc +++ b/tensorflow/c/experimental/gradients/tape/tape_operation.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include "absl/status/status.h" +#include "absl/strings/string_view.h" #include "absl/types/span.h" #include "tensorflow/c/eager/abstract_operation.h" #include "tensorflow/c/eager/abstract_tensor_handle.h" diff --git a/tensorflow/c/experimental/grappler/grappler_test.cc b/tensorflow/c/experimental/grappler/grappler_test.cc index 32ac04832551c1..205aeec55ebf8c 100644 --- a/tensorflow/c/experimental/grappler/grappler_test.cc +++ b/tensorflow/c/experimental/grappler/grappler_test.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include #include +#include #include #include @@ -70,11 +71,11 @@ TEST(Grappler, SuccessfulRegistration) { TF_ASSERT_OK(InitGraphPlugin(plugin_init)); ASSERT_EQ(PluginGraphOptimizerRegistry::CreateOptimizers( - std::set{"Success"}) + std::set{"Success"}) .size(), 1); ConfigList config = PluginGraphOptimizerRegistry::GetPluginConfigs( - true, std::set{"Success"}); + true, std::set{"Success"}); ASSERT_EQ(config.toggle_config["remapping"], RewriterConfig::OFF); } @@ -95,7 +96,7 @@ TEST(Grappler, MultiplePluginRegistration) { TF_ASSERT_OK(InitGraphPlugin(plugin_init_0)); TF_ASSERT_OK(InitGraphPlugin(plugin_init_1)); ASSERT_EQ(PluginGraphOptimizerRegistry::CreateOptimizers( - std::set{"Device0", "Device1"}) + std::set{"Device0", "Device1"}) .size(), 2); } @@ -132,12 +133,12 @@ TEST(Grappler, OptimizeFuncNotSet) { TEST(TF_GrapplerItem, NodesToPreserve) { GrapplerItem item; - item.fetch = std::vector{"Conv", "BiasAdd"}; - std::unordered_set nodes_preserved = item.NodesToPreserve(); + item.fetch = std::vector{"Conv", "BiasAdd"}; + std::unordered_set nodes_preserved = item.NodesToPreserve(); TF_GrapplerItem* c_item = reinterpret_cast(&item); int 
list_total_size = 0; - for (const string& s : nodes_preserved) { + for (const std::string& s : nodes_preserved) { list_total_size += s.size(); } @@ -158,20 +159,21 @@ TEST(TF_GrapplerItem, NodesToPreserve) { EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); for (size_t i = 0; i < nodes_preserved.size(); ++i) { - EXPECT_EQ(nodes_preserved.find(string(static_cast(values[i]), - lens[i])) != nodes_preserved.end(), - true); + EXPECT_EQ( + nodes_preserved.find(std::string(static_cast(values[i]), + lens[i])) != nodes_preserved.end(), + true); } TF_DeleteStatus(status); } TEST(TF_GrapplerItem, FetchNodes) { GrapplerItem item; - item.fetch = std::vector{"Conv", "BiasAdd"}; + item.fetch = std::vector{"Conv", "BiasAdd"}; TF_GrapplerItem* c_item = reinterpret_cast(&item); int list_total_size = 0; - for (const string& s : item.fetch) { + for (const std::string& s : item.fetch) { list_total_size += s.size(); } @@ -193,7 +195,7 @@ TEST(TF_GrapplerItem, FetchNodes) { for (size_t i = 0; i < item.fetch.size(); ++i) { EXPECT_EQ(item.fetch[i].size(), lens[i]) << i; EXPECT_EQ(item.fetch[i], - string(static_cast(values[i]), lens[i])) + std::string(static_cast(values[i]), lens[i])) << i; } TF_DeleteStatus(status); @@ -307,13 +309,13 @@ TEST(TF_FunctionLibraryDefinition, LookUpOpDef) { TF_NewFunctionLibraryDefinition(g_buf, status); TF_LookUpOpDef(func, "Add", op_buf, status); - string actual_string(reinterpret_cast(op_buf->data), - op_buf->length); + std::string actual_string(reinterpret_cast(op_buf->data), + op_buf->length); ASSERT_EQ(TF_OK, TF_GetCode(status)); const OpDef* expected_op_def; TF_ASSERT_OK(OpRegistry::Global()->LookUpOpDef("Add", &expected_op_def)); - string expected_serialized; + std::string expected_serialized; expected_op_def->SerializeToString(&expected_serialized); EXPECT_EQ(expected_serialized, actual_string); TF_DeleteBuffer(g_buf); diff --git a/tensorflow/c/experimental/next_pluggable_device/BUILD b/tensorflow/c/experimental/next_pluggable_device/BUILD 
index 348f5c5d6d0341..f4a57a7d265420 100644 --- a/tensorflow/c/experimental/next_pluggable_device/BUILD +++ b/tensorflow/c/experimental/next_pluggable_device/BUILD @@ -33,10 +33,10 @@ cc_library( "@com_google_absl//absl/strings:string_view", "@com_google_absl//absl/time", "@com_google_absl//absl/types:span", - "@local_xla//xla/pjrt:pjrt_c_api_client", "@local_xla//xla/pjrt:pjrt_client", "@local_xla//xla/pjrt/c:pjrt_c_api_hdrs", "@local_xla//xla/pjrt/c:pjrt_c_api_helpers", + "@local_xla//xla/pjrt/c_api_client:pjrt_c_api_client", "@local_xla//xla/tsl/distributed_runtime/coordination:coordination_service_agent", ], ) @@ -70,9 +70,9 @@ cc_library( "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", - "@local_xla//xla/pjrt:pjrt_c_api_client", "@local_xla//xla/pjrt:pjrt_client", "@local_xla//xla/pjrt/c:pjrt_c_api_hdrs", + "@local_xla//xla/pjrt/c_api_client:pjrt_c_api_client", "@local_xla//xla/tsl/platform:errors", "@local_xla//xla/tsl/platform:statusor", ], @@ -96,10 +96,10 @@ tf_cc_test( "@local_xla//xla:shape_util", "@local_xla//xla:xla_data_proto_cc", "@local_xla//xla/pjrt:pjrt_api", - "@local_xla//xla/pjrt:pjrt_c_api_client", "@local_xla//xla/pjrt/c:pjrt_c_api_cpu", "@local_xla//xla/pjrt/c:pjrt_c_api_hdrs", "@local_xla//xla/pjrt/c:pjrt_c_api_wrapper_impl", + "@local_xla//xla/pjrt/c_api_client:pjrt_c_api_client", "@local_xla//xla/pjrt/plugin/xla_cpu:cpu_client_options", "@local_xla//xla/pjrt/plugin/xla_cpu:xla_cpu_pjrt_client", "@local_xla//xla/tsl/lib/core:status_test_util", diff --git a/tensorflow/c/experimental/next_pluggable_device/c_api.cc b/tensorflow/c/experimental/next_pluggable_device/c_api.cc index fdb8a9e7f47794..569e7d0eed0ca4 100644 --- a/tensorflow/c/experimental/next_pluggable_device/c_api.cc +++ b/tensorflow/c/experimental/next_pluggable_device/c_api.cc @@ -40,7 +40,7 @@ limitations under the License. 
#include "tensorflow/compiler/jit/variable_info_util.h" #include "xla/pjrt/c/pjrt_c_api.h" #include "xla/pjrt/c/pjrt_c_api_helpers.h" -#include "xla/pjrt/pjrt_c_api_client.h" +#include "xla/pjrt/c_api_client/pjrt_c_api_client.h" #include "xla/pjrt/pjrt_client.h" #include "xla/tsl/distributed_runtime/coordination/coordination_service_agent.h" #include "tensorflow/core/common_runtime/next_pluggable_device/plugin_resource.h" diff --git a/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.cc b/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.cc index 5344db87abcae0..4df0e5d336273f 100644 --- a/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.cc +++ b/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.cc @@ -22,7 +22,7 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "tensorflow/compiler/jit/pjrt_tensor_buffer_util.h" #include "xla/pjrt/c/pjrt_c_api.h" -#include "xla/pjrt/pjrt_c_api_client.h" +#include "xla/pjrt/c_api_client/pjrt_c_api_client.h" #include "xla/pjrt/pjrt_client.h" #include "xla/tsl/platform/errors.h" #include "xla/tsl/platform/statusor.h" diff --git a/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.h b/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.h index c2378b68109fc9..24fc0cc20d3c3a 100644 --- a/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.h +++ b/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.h @@ -18,7 +18,7 @@ limitations under the License. 
#include "absl/status/status.h" #include "absl/status/statusor.h" #include "xla/pjrt/c/pjrt_c_api.h" -#include "xla/pjrt/pjrt_c_api_client.h" +#include "xla/pjrt/c_api_client/pjrt_c_api_client.h" #include "tensorflow/core/framework/tensor.h" namespace tensorflow { diff --git a/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util_test.cc b/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util_test.cc index 7220877fad0ed8..c5d2b18dac36aa 100644 --- a/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util_test.cc +++ b/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util_test.cc @@ -28,8 +28,8 @@ limitations under the License. #include "xla/pjrt/c/pjrt_c_api.h" #include "xla/pjrt/c/pjrt_c_api_cpu.h" #include "xla/pjrt/c/pjrt_c_api_wrapper_impl.h" +#include "xla/pjrt/c_api_client/pjrt_c_api_client.h" #include "xla/pjrt/pjrt_api.h" -#include "xla/pjrt/pjrt_c_api_client.h" #include "xla/pjrt/plugin/xla_cpu/cpu_client_options.h" #include "xla/pjrt/plugin/xla_cpu/xla_cpu_pjrt_client.h" #include "xla/shape.h" diff --git a/tensorflow/c/experimental/ops/gen/common/case_format.cc b/tensorflow/c/experimental/ops/gen/common/case_format.cc index 82acc32f623fd8..1992357201af18 100644 --- a/tensorflow/c/experimental/ops/gen/common/case_format.cc +++ b/tensorflow/c/experimental/ops/gen/common/case_format.cc @@ -14,7 +14,7 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/c/experimental/ops/gen/common/case_format.h" -#include +#include #include "absl/strings/ascii.h" #include "tensorflow/core/platform/types.h" @@ -31,14 +31,14 @@ enum CaseFormatType { UPPER_SNAKE, }; -string FormatStringCase(const string &str, CaseFormatType to, - const char delimiter = '_') { +std::string FormatStringCase(const std::string& str, CaseFormatType to, + const char delimiter = '_') { const bool from_snake = (str == absl::AsciiStrToUpper(str)) || (str == absl::AsciiStrToLower(str)); const bool toUpper = (to == UPPER_CAMEL || to == UPPER_SNAKE); const bool toSnake = (to == LOWER_SNAKE || to == UPPER_SNAKE); - string result; + std::string result; bool inputStart = true; bool wordStart = true; @@ -52,7 +52,7 @@ string FormatStringCase(const string &str, CaseFormatType to, wordStart = true; continue; } - if (!from_snake && isupper(c)) { + if (!from_snake && absl::ascii_isupper(c)) { wordStart = true; } @@ -65,9 +65,9 @@ string FormatStringCase(const string &str, CaseFormatType to, const bool shouldCapIfSnake = toUpper; const bool shouldCapIfCamel = wordStart && (toUpper || !inputStart); if ((toSnake && shouldCapIfSnake) || (!toSnake && shouldCapIfCamel)) { - result += toupper(c); + result += absl::ascii_toupper(c); } else { - result += tolower(c); + result += absl::ascii_tolower(c); } // at this point we are no longer at the start of a word: @@ -90,16 +90,16 @@ string FormatStringCase(const string &str, CaseFormatType to, // Public interface // -string toLowerCamel(const string &s, const char delimiter) { +std::string toLowerCamel(const std::string& s, const char delimiter) { return FormatStringCase(s, LOWER_CAMEL, delimiter); } -string toLowerSnake(const string &s, const char delimiter) { +std::string toLowerSnake(const std::string& s, const char delimiter) { return FormatStringCase(s, LOWER_SNAKE, delimiter); } -string toUpperCamel(const string &s, const 
char delimiter) { +std::string toUpperCamel(const std::string& s, const char delimiter) { return FormatStringCase(s, UPPER_CAMEL, delimiter); } -string toUpperSnake(const string &s, const char delimiter) { +std::string toUpperSnake(const std::string& s, const char delimiter) { return FormatStringCase(s, UPPER_SNAKE, delimiter); } diff --git a/tensorflow/c/experimental/ops/gen/common/case_format.h b/tensorflow/c/experimental/ops/gen/common/case_format.h index f8255f6aa21c17..880f286788e0a2 100644 --- a/tensorflow/c/experimental/ops/gen/common/case_format.h +++ b/tensorflow/c/experimental/ops/gen/common/case_format.h @@ -35,10 +35,10 @@ namespace generator { // "__OneTwo__" (in camel case) <==> "__ONE_TWO__" (in snake case) // // Note: performance not yet tested. -string toLowerCamel(const string &s, const char delimiter = '_'); -string toLowerSnake(const string &s, const char delimiter = '_'); -string toUpperCamel(const string &s, const char delimiter = '_'); -string toUpperSnake(const string &s, const char delimiter = '_'); +std::string toLowerCamel(const std::string& s, const char delimiter = '_'); +std::string toLowerSnake(const std::string& s, const char delimiter = '_'); +std::string toUpperCamel(const std::string& s, const char delimiter = '_'); +std::string toUpperSnake(const std::string& s, const char delimiter = '_'); } // namespace generator } // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/gen/common/case_format_test.cc b/tensorflow/c/experimental/ops/gen/common/case_format_test.cc index 302bcc42453169..e769acb94bff73 100644 --- a/tensorflow/c/experimental/ops/gen/common/case_format_test.cc +++ b/tensorflow/c/experimental/ops/gen/common/case_format_test.cc @@ -14,6 +14,8 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/c/experimental/ops/gen/common/case_format.h" +#include + #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -25,13 +27,13 @@ namespace { // For each test case, we manually construct the 4 variations in string case and // test all 16 conversions: from and to each of the 4 string case variations. struct Variations { - string lower_camel; - string lower_snake; - string upper_camel; - string upper_snake; + std::string lower_camel; + std::string lower_snake; + std::string upper_camel; + std::string upper_snake; }; -void TestSingleVariation(const string &str, Variations expected, +void TestSingleVariation(const std::string& str, Variations expected, char delimiter = '_') { EXPECT_EQ(expected.lower_camel, toLowerCamel(str, delimiter)); EXPECT_EQ(expected.lower_snake, toLowerSnake(str, delimiter)); diff --git a/tensorflow/c/experimental/ops/gen/common/controller.cc b/tensorflow/c/experimental/ops/gen/common/controller.cc index fb3e321714b108..7c9bf279fdcd2a 100644 --- a/tensorflow/c/experimental/ops/gen/common/controller.cc +++ b/tensorflow/c/experimental/ops/gen/common/controller.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/c/experimental/ops/gen/common/controller.h" +#include #include #include "absl/log/check.h" @@ -43,7 +44,7 @@ Controller::Controller(PathConfig path_config, Env* env) } Controller::~Controller() { delete api_def_map_; } -const void Controller::WriteFile(const string& file_path, +const void Controller::WriteFile(const std::string& file_path, const SourceCode& code) const { TF_CHECK_OK(WriteStringToFile(env_, file_path, code.Render())) << file_path; } @@ -60,8 +61,9 @@ void Controller::InitializeOpApi() { api_def_map_ = new ApiDefMap(op_list_); for (const auto& op : op_list_.op()) { for (const auto& dir : path_config_.api_dirs) { - const string file_name = absl::Substitute("api_def_$0.pbtxt", op.name()); - const string file_path = io::JoinPath(dir, file_name); + const std::string file_name = + absl::Substitute("api_def_$0.pbtxt", op.name()); + const std::string file_path = io::JoinPath(dir, file_name); if (env_->FileExists(file_path).ok()) { TF_CHECK_OK(api_def_map_->LoadFile(env_, file_path)) << file_path; } else { diff --git a/tensorflow/c/experimental/ops/gen/common/controller.h b/tensorflow/c/experimental/ops/gen/common/controller.h index e152efeb6d8f9f..c33891f963d7a6 100644 --- a/tensorflow/c/experimental/ops/gen/common/controller.h +++ b/tensorflow/c/experimental/ops/gen/common/controller.h @@ -32,7 +32,8 @@ class Controller { public: explicit Controller(PathConfig path_config, Env* env = Env::Default()); virtual ~Controller(); - const void WriteFile(const string& file_path, const SourceCode& code) const; + const void WriteFile(const std::string& file_path, + const SourceCode& code) const; const std::vector& GetModelOps() const; private: diff --git a/tensorflow/c/experimental/ops/gen/common/path_config.cc b/tensorflow/c/experimental/ops/gen/common/path_config.cc index 2ec57d67c9d6f7..6de98c242b1afa 100644 --- 
a/tensorflow/c/experimental/ops/gen/common/path_config.cc +++ b/tensorflow/c/experimental/ops/gen/common/path_config.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/c/experimental/ops/gen/common/path_config.h" #include +#include #include #include "absl/strings/str_join.h" @@ -24,9 +25,10 @@ limitations under the License. namespace tensorflow { namespace generator { -PathConfig::PathConfig(const string& output_dir, const string& source_dir, - const string& api_dir_list, - const std::vector op_names) +PathConfig::PathConfig(const std::string& output_dir, + const std::string& source_dir, + const std::string& api_dir_list, + const std::vector op_names) : output_path(output_dir), op_names(op_names) { api_dirs = str_util::Split(api_dir_list, ",", str_util::SkipEmpty()); @@ -39,7 +41,7 @@ PathConfig::PathConfig(const string& output_dir, const string& source_dir, tf_root_dir = "tensorflow"; // Prefix, e.g. "third_party" given root_dir "third_party/tensorflow/...." - std::vector source_path_components = + std::vector source_path_components = tensorflow::str_util::Split(source_dir, "/"); auto source_tfroot_pos = std::find(source_path_components.begin(), source_path_components.end(), tf_root_dir); @@ -51,7 +53,7 @@ PathConfig::PathConfig(const string& output_dir, const string& source_dir, } // TF subdir, e.g. 
"c/ops" given output_dir "blah/blah/tensorflow/c/ops" - std::vector output_path_components = + std::vector output_path_components = tensorflow::str_util::Split(output_dir, "/"); auto output_tfroot_pos = std::find(output_path_components.begin(), output_path_components.end(), tf_root_dir); diff --git a/tensorflow/c/experimental/ops/gen/common/path_config.h b/tensorflow/c/experimental/ops/gen/common/path_config.h index ce29063be5f682..d47266f86e38ef 100644 --- a/tensorflow/c/experimental/ops/gen/common/path_config.h +++ b/tensorflow/c/experimental/ops/gen/common/path_config.h @@ -23,17 +23,18 @@ namespace tensorflow { namespace generator { struct PathConfig { - string output_path; - std::vector op_names; - std::vector api_dirs; - string tf_prefix_dir; - string tf_root_dir; - string tf_output_dir; + std::string output_path; + std::vector op_names; + std::vector api_dirs; + std::string tf_prefix_dir; + std::string tf_root_dir; + std::string tf_output_dir; explicit PathConfig() = default; - explicit PathConfig(const string &output_dir, const string &source_dir, - const string &api_dir_list, - const std::vector op_names); + explicit PathConfig(const std::string& output_dir, + const std::string& source_dir, + const std::string& api_dir_list, + const std::vector op_names); }; } // namespace generator diff --git a/tensorflow/c/experimental/ops/gen/common/source_code.cc b/tensorflow/c/experimental/ops/gen/common/source_code.cc index 2b7bce6a263184..28e55659c1cc90 100644 --- a/tensorflow/c/experimental/ops/gen/common/source_code.cc +++ b/tensorflow/c/experimental/ops/gen/common/source_code.cc @@ -14,10 +14,13 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/c/experimental/ops/gen/common/source_code.h" +#include + #include "absl/log/log.h" #include "absl/strings/ascii.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "absl/strings/strip.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stringpiece.h" @@ -25,20 +28,20 @@ limitations under the License. namespace tensorflow { namespace generator { -string SourceCode::Render() const { - string code; +std::string SourceCode::Render() const { + std::string code; for (const Line& line : lines_) { - absl::StrAppend(&code, string(line.indent * spaces_per_indent_, ' '), + absl::StrAppend(&code, std::string(line.indent * spaces_per_indent_, ' '), line.text, "\n"); } return code; } -void SourceCode::AddLineWithIndent(const string& line) { +void SourceCode::AddLineWithIndent(const std::string& line) { ValidateAndAddLine(current_indent_, line); } -void SourceCode::AddLineWithoutIndent(const string& line) { +void SourceCode::AddLineWithoutIndent(const std::string& line) { ValidateAndAddLine(0, line); } @@ -48,7 +51,7 @@ void SourceCode::IncreaseIndent() { current_indent_++; } void SourceCode::DecreaseIndent() { current_indent_--; } -void SourceCode::ValidateAndAddLine(int indent, const string& raw_line) { +void SourceCode::ValidateAndAddLine(int indent, const std::string& raw_line) { absl::string_view line(raw_line); bool had_trailing_newline = absl::ConsumeSuffix(&line, "\n"); @@ -57,7 +60,8 @@ void SourceCode::ValidateAndAddLine(int indent, const string& raw_line) { } else if (had_trailing_newline) { LOG(WARNING) << "Superfluous trailing newline in '" << line << "'"; } - lines_.push_back({indent, string(absl::StripTrailingAsciiWhitespace(line))}); + lines_.push_back( + {indent, std::string(absl::StripTrailingAsciiWhitespace(line))}); } } // namespace generator diff --git 
a/tensorflow/c/experimental/ops/gen/common/source_code.h b/tensorflow/c/experimental/ops/gen/common/source_code.h index df1aa90acf7b8c..9fd7f7eec5e174 100644 --- a/tensorflow/c/experimental/ops/gen/common/source_code.h +++ b/tensorflow/c/experimental/ops/gen/common/source_code.h @@ -24,13 +24,13 @@ namespace generator { class SourceCode { public: - string Render() const; + std::string Render() const; void SetSpacesPerIndent(int spaces_per_indent) { spaces_per_indent_ = spaces_per_indent; } - void AddLineWithIndent(const string &line); - void AddLineWithoutIndent(const string &line); + void AddLineWithIndent(const std::string& line); + void AddLineWithoutIndent(const std::string& line); void AddBlankLine(); void IncreaseIndent(); void DecreaseIndent(); @@ -38,10 +38,10 @@ class SourceCode { private: struct Line { int indent; - string text; + std::string text; }; - void ValidateAndAddLine(int indent_level, const string &raw_line); + void ValidateAndAddLine(int indent_level, const std::string& raw_line); int spaces_per_indent_ = 2; int current_indent_ = 0; diff --git a/tensorflow/c/experimental/ops/gen/common/view_util.cc b/tensorflow/c/experimental/ops/gen/common/view_util.cc index 388aa0646db82b..d8095aca80cf51 100644 --- a/tensorflow/c/experimental/ops/gen/common/view_util.cc +++ b/tensorflow/c/experimental/ops/gen/common/view_util.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/c/experimental/ops/gen/common/view_util.h" +#include #include #include "absl/strings/str_join.h" @@ -23,17 +24,20 @@ limitations under the License. 
namespace tensorflow { namespace generator { -string Call(const string& object, const string& method, - std::vector arguments, const char* oper) { +std::string Call(const std::string& object, const std::string& method, + std::vector arguments, const char* oper) { return absl::Substitute("$0$1$2($3)", object, oper, method, absl::StrJoin(arguments, ", ")); } -string Call(const string& function, std::vector arguments) { +std::string Call(const std::string& function, + std::vector arguments) { return absl::Substitute("$0($1)", function, absl::StrJoin(arguments, ", ")); } -string Quoted(const string& s) { return absl::Substitute("\"$0\"", s); } +std::string Quoted(const std::string& s) { + return absl::Substitute("\"$0\"", s); +} } // namespace generator } // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/gen/common/view_util.h b/tensorflow/c/experimental/ops/gen/common/view_util.h index 7ab437a90e4fd8..f23831ce8a07dd 100644 --- a/tensorflow/c/experimental/ops/gen/common/view_util.h +++ b/tensorflow/c/experimental/ops/gen/common/view_util.h @@ -22,10 +22,11 @@ limitations under the License. 
namespace tensorflow { namespace generator { -string Call(const string &function, std::vector arguments); -string Call(const string &object, const string &method, - std::vector arguments, const char *oper = "->"); -string Quoted(const string &s); +std::string Call(const std::string& function, + std::vector arguments); +std::string Call(const std::string& object, const std::string& method, + std::vector arguments, const char* oper = "->"); +std::string Quoted(const std::string& s); } // namespace generator } // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.cc b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.cc index 3fe5c059ca4e70..45e7b87069e361 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.cc @@ -52,11 +52,11 @@ SourceCode CppGenerator::SourceFileContents() const { return GenerateOneFile(cpp::RendererContext::kSource); } -string CppGenerator::HeaderFileName() const { +std::string CppGenerator::HeaderFileName() const { return io::JoinPath(path_config_.output_path, cpp_config_.unit + "_ops.h"); } -string CppGenerator::SourceFileName() const { +std::string CppGenerator::SourceFileName() const { return io::JoinPath(path_config_.output_path, cpp_config_.unit + "_ops.cc"); } diff --git a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.h b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.h index 0a7b08cd9b171f..b4d016e0ecca44 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.h +++ b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.h @@ -30,8 +30,8 @@ class CppGenerator { explicit CppGenerator(cpp::CppConfig cpp_config, PathConfig path_config); SourceCode HeaderFileContents() const; SourceCode SourceFileContents() const; - string HeaderFileName() const; - string SourceFileName() const; + std::string HeaderFileName() const; + std::string SourceFileName() const; void WriteHeaderFile() const; void WriteSourceFile() const; diff 
--git a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator_test.cc b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator_test.cc index f4a4d82bbce423..e1db2c9b8ce14b 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator_test.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator_test.cc @@ -30,12 +30,12 @@ namespace generator { namespace { TEST(CppGeneratorTest, typical_usage) { - string category = "testing"; - string name_space = "tensorflow::ops"; - string output_dir = "tensorflow/c/experimental/ops/gen/cpp/golden"; - string source_dir = "tensorflow"; - string api_dirs = ""; - std::vector ops = { + std::string category = "testing"; + std::string name_space = "tensorflow::ops"; + std::string output_dir = "tensorflow/c/experimental/ops/gen/cpp/golden"; + std::string source_dir = "tensorflow"; + std::string api_dirs = ""; + std::vector ops = { "Neg", // Simple unary Op "MatMul", // 2 inputs & attrs with default values "IdentityN", // Variadic input+output @@ -50,17 +50,19 @@ TEST(CppGeneratorTest, typical_usage) { CppGenerator generator(cpp_config, controller_config); Env *env = Env::Default(); - string golden_dir = io::JoinPath(testing::TensorFlowSrcRoot(), - controller_config.tf_output_dir); + std::string golden_dir = io::JoinPath(testing::TensorFlowSrcRoot(), + controller_config.tf_output_dir); - string generated_header = generator.HeaderFileContents().Render(); - string generated_source = generator.SourceFileContents().Render(); - string expected_header; - string header_file_name = io::JoinPath(golden_dir, "testing_ops.h.golden"); + std::string generated_header = generator.HeaderFileContents().Render(); + std::string generated_source = generator.SourceFileContents().Render(); + std::string expected_header; + std::string header_file_name = + io::JoinPath(golden_dir, "testing_ops.h.golden"); TF_CHECK_OK(ReadFileToString(env, header_file_name, &expected_header)); - string expected_source; - string source_file_name = io::JoinPath(golden_dir, 
"testing_ops.cc.golden"); + std::string expected_source; + std::string source_file_name = + io::JoinPath(golden_dir, "testing_ops.cc.golden"); TF_CHECK_OK(ReadFileToString(env, source_file_name, &expected_source)); // Remove carriage returns (for Windows) diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc index 4f0e64e3b0f8eb..7c8231a71133f5 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc @@ -22,7 +22,7 @@ namespace tensorflow { namespace generator { namespace cpp { -CppConfig::CppConfig(const string &category, const string &name_space) +CppConfig::CppConfig(const std::string& category, const std::string& name_space) : category(category), unit(absl::AsciiStrToLower(category)), namespaces(absl::StrSplit(name_space, "::")) {} diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.h b/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.h index fa7571d98a1214..eec5888e17e7cf 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.h +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.h @@ -24,13 +24,13 @@ namespace generator { namespace cpp { struct CppConfig { - string category; - string unit; - std::vector namespaces; + std::string category; + std::string unit; + std::vector namespaces; explicit CppConfig() = default; - explicit CppConfig(const string &category, - const string &name_space = "tensorflow::ops"); + explicit CppConfig(const std::string& category, + const std::string& name_space = "tensorflow::ops"); }; } // namespace cpp diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc index 1a685cac0c405c..50db08df1db988 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc +++ 
b/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc @@ -27,10 +27,10 @@ namespace generator { namespace cpp { GuardRenderer::GuardRenderer(RendererContext context) : Renderer(context) { - string self_path = io::JoinPath(context_.path_config.tf_root_dir, - context_.path_config.tf_output_dir, - context_.cpp_config.unit + "_ops.h"); - string with_underscores(self_path); + std::string self_path = io::JoinPath(context_.path_config.tf_root_dir, + context_.path_config.tf_output_dir, + context_.cpp_config.unit + "_ops.h"); + std::string with_underscores(self_path); std::replace(with_underscores.begin(), with_underscores.end(), '/', '_'); std::replace(with_underscores.begin(), with_underscores.end(), '.', '_'); guard_ = toUpperSnake(with_underscores) + "_"; diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.h b/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.h index a45fe89a7a011c..bbd29e4620e2c2 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.h +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.h @@ -31,7 +31,7 @@ class GuardRenderer : public Renderer { void Close(); private: - string guard_; + std::string guard_; }; } // namespace cpp diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc index 38f31209f6da24..0ec8108bee7aaf 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc @@ -30,13 +30,13 @@ void IncludeRenderer::SelfHeader() { BlankLine(); } -string IncludeRenderer::SelfHeaderPath() const { +std::string IncludeRenderer::SelfHeaderPath() const { return io::JoinPath(context_.path_config.tf_root_dir, context_.path_config.tf_output_dir, context_.cpp_config.unit + "_ops.h"); } -void IncludeRenderer::Include(const string &tf_file_path) { +void IncludeRenderer::Include(const 
std::string& tf_file_path) { CodeLine("#include \"$0\"", io::JoinPath(context_.path_config.tf_prefix_dir, tf_file_path)); } diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.h b/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.h index e43715a62e45b0..4178f0da5beeb9 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.h +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.h @@ -27,12 +27,12 @@ class IncludeRenderer : public Renderer { public: explicit IncludeRenderer(RendererContext context); - string SelfHeaderPath() const; + std::string SelfHeaderPath() const; void SelfHeader(); void Headers(); private: - void Include(const string &tf_file_path); + void Include(const std::string& tf_file_path); }; } // namespace cpp diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc index db28ab303ae5c6..b490cc7fe9e86a 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc @@ -26,7 +26,7 @@ NamespaceRenderer::NamespaceRenderer(RendererContext context) : Renderer(context) {} void NamespaceRenderer::Open() { - for (const string& ns : context_.cpp_config.namespaces) { + for (const std::string& ns : context_.cpp_config.namespaces) { CodeLine("namespace " + ns + " {"); } } diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc index c459d239ca699f..63cb5f30eb1d9d 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc @@ -31,11 +31,11 @@ namespace tensorflow { namespace generator { namespace cpp { -string OpRenderer::Signature() const { - std::vector args_with_default_val; - std::vector args_without_default_val; +std::string 
OpRenderer::Signature() const { + std::vector args_with_default_val; + std::vector args_without_default_val; for (OpArgumentView const& argument : op_.AllArguments()) { - string text = argument.Declaration(); + std::string text = argument.Declaration(); if (context_.mode == RendererContext::kHeader) { absl::StrAppend(&text, argument.Initializer()); } @@ -45,7 +45,7 @@ string OpRenderer::Signature() const { args_without_default_val.push_back(text); } } - std::vector arguments; + std::vector arguments; arguments.reserve(args_without_default_val.size() + args_with_default_val.size()); arguments.insert(arguments.end(), diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.h b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.h index 3360e14e672e3a..1ea161f55bdad9 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.h +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.h @@ -34,7 +34,7 @@ class OpRenderer : public Renderer { OpView op_; OpCommentRenderer comment_; - string Signature() const; + std::string Signature() const; }; } // namespace cpp diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc index a9efb94335c0a6..6a608d759a3753 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc @@ -34,21 +34,21 @@ Renderer& Renderer::BlankLine() { return *this; } -Renderer& Renderer::CodeLine(const string& text) { +Renderer& Renderer::CodeLine(const std::string& text) { context_.code.AddLineWithoutIndent(text); return *this; } -Renderer& Renderer::CodeLines(const string& text) { +Renderer& Renderer::CodeLines(const std::string& text) { absl::string_view trimmed_text(text); str_util::RemoveWhitespaceContext(&trimmed_text); - for (const string& line : str_util::Split(trimmed_text, '\n')) { + for (const std::string& line : str_util::Split(trimmed_text, 
'\n')) { context_.code.AddLineWithoutIndent(line); } return *this; } -Renderer& Renderer::Statement(const string& text) { +Renderer& Renderer::Statement(const std::string& text) { if (absl::EndsWith(text, ";")) { LOG(WARNING) << "Superfluous terminating ';' in '" << text << "'"; context_.code.AddLineWithIndent(text); @@ -58,22 +58,22 @@ Renderer& Renderer::Statement(const string& text) { return *this; } -Renderer& Renderer::TFStatement(const string& text) { +Renderer& Renderer::TFStatement(const std::string& text) { return Statement(absl::Substitute("TF_RETURN_IF_ERROR($0)", text)); } -Renderer& Renderer::CommentLine(const string& text) { +Renderer& Renderer::CommentLine(const std::string& text) { context_.code.AddLineWithIndent(absl::StrCat("// ", text)); return *this; } -Renderer& Renderer::BlockOpen(const string& text) { +Renderer& Renderer::BlockOpen(const std::string& text) { context_.code.AddLineWithIndent(absl::StrCat(text, " {")); context_.code.IncreaseIndent(); return *this; } -Renderer& Renderer::BlockClose(const string& text) { +Renderer& Renderer::BlockClose(const std::string& text) { context_.code.DecreaseIndent(); context_.code.AddLineWithIndent(absl::StrCat("}", text)); return *this; diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h index b6168b196b35b2..f41923651f44e2 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h @@ -34,7 +34,7 @@ class Renderer { // Append a line of source code, left-justified (not indented). // Use for preprocessors directives ("#include"), namespaces, etc. 
- Renderer &CodeLine(const string &text); + Renderer& CodeLine(const std::string& text); template Renderer CodeLine(absl::string_view text, const Args &...args) { return CodeLine(absl::Substitute(text, args...)); @@ -44,7 +44,7 @@ class Renderer { // Note: Trims leading/trailing whitespace including newlines, making this // method convenient for multiline raw strings. // Newlines ('\n') are allowed/expected. - Renderer &CodeLines(const string &text); + Renderer& CodeLines(const std::string& text); template Renderer CodeLines(absl::string_view text, const Args &...args) { return CodeLines(absl::Substitute(text, args...)); @@ -52,7 +52,7 @@ class Renderer { // Indent and append a C++ statement. // Note: do *not* include a trailing semicolon in the statement text. - Renderer &Statement(const string &text); + Renderer& Statement(const std::string& text); template Renderer Statement(absl::string_view text, const Args &...args) { return Statement(absl::Substitute(text, args...)); @@ -60,14 +60,14 @@ class Renderer { // Indent and append a call to a TF method returning a Status to check. // Note: do *not* include a trailing semicolon in the statement text. - Renderer &TFStatement(const string &text); + Renderer& TFStatement(const std::string& text); template Renderer TFStatement(absl::string_view text, const Args &...args) { return TFStatement(absl::Substitute(text, args...)); } // Indent and append a C++ single-line style comment (using '//'). - Renderer &CommentLine(const string &text = ""); + Renderer& CommentLine(const std::string& text = ""); template Renderer CommentLine(absl::string_view text, const Args &...args) { return CommentLine(absl::Substitute(text, args...)); @@ -75,7 +75,7 @@ class Renderer { // Append a line of code which starts a new block: trailing with '{') and // indenting. 
- Renderer &BlockOpen(const string &text); + Renderer& BlockOpen(const std::string& text); template Renderer BlockOpen(absl::string_view text, const Args &...args) { return BlockOpen(absl::Substitute(text, args...)); @@ -83,7 +83,7 @@ class Renderer { // Append a line of code ending a block: unindenting and adding '}'. // Note: optional trailing text is often a comment, e.g. '// namespace xyz'. - Renderer &BlockClose(const string &text = ""); + Renderer& BlockClose(const std::string& text = ""); template Renderer BlockClose(absl::string_view text, const Args &...args) { return BlockClose(absl::Substitute(text, args...)); diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc index eff654c5938160..6621d1aea2c217 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc @@ -57,7 +57,7 @@ TEST(Renderer, typical_usage) { SourceCode code; TestRenderer(code).Render(); - string expected = R"(// File level comment. + std::string expected = R"(// File level comment. #include "header.h" void TestFunction() { diff --git a/tensorflow/c/experimental/ops/gen/generate_cpp_main.cc b/tensorflow/c/experimental/ops/gen/generate_cpp_main.cc index 18a506942de5b7..cb922d0a06b7ae 100644 --- a/tensorflow/c/experimental/ops/gen/generate_cpp_main.cc +++ b/tensorflow/c/experimental/ops/gen/generate_cpp_main.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include #include #include "absl/log/check.h" diff --git a/tensorflow/c/experimental/saved_model/core/object_graph_traversal_test.cc b/tensorflow/c/experimental/saved_model/core/object_graph_traversal_test.cc index c2bf61d785e6b2..417a0f26d70b92 100644 --- a/tensorflow/c/experimental/saved_model/core/object_graph_traversal_test.cc +++ b/tensorflow/c/experimental/saved_model/core/object_graph_traversal_test.cc @@ -26,8 +26,7 @@ namespace { SavedObjectGraph ParseSavedObjectGraph(absl::string_view text_proto) { SavedObjectGraph value; - CHECK(tensorflow::protobuf::TextFormat::ParseFromString(string(text_proto), - &value)); + CHECK(tensorflow::protobuf::TextFormat::ParseFromString(text_proto, &value)); return value; } diff --git a/tensorflow/c/experimental/saved_model/core/ops/BUILD b/tensorflow/c/experimental/saved_model/core/ops/BUILD index 4214f76cee1cee..de027662df30cf 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/BUILD +++ b/tensorflow/c/experimental/saved_model/core/ops/BUILD @@ -82,6 +82,7 @@ tf_cc_test( "//tensorflow/core/common_runtime/eager:context", "//tensorflow/core/common_runtime/eager:core", "@com_google_absl//absl/status", + "@com_google_absl//absl/strings:string_view", ], ) diff --git a/tensorflow/c/experimental/saved_model/core/ops/restore_ops_test.cc b/tensorflow/c/experimental/saved_model/core/ops/restore_ops_test.cc index 1d55dabcc9ab87..866dbaa94895d0 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/restore_ops_test.cc +++ b/tensorflow/c/experimental/saved_model/core/ops/restore_ops_test.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include #include "absl/status/status.h" +#include "absl/strings/string_view.h" #include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/experimental/saved_model/core/test_utils.h" #include "tensorflow/c/tensor_interface.h" diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.cc b/tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.cc index 2ac31f313230ac..673411a44456d1 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.cc +++ b/tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.cc @@ -23,6 +23,7 @@ limitations under the License. #include "absl/status/status.h" #include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "absl/types/span.h" #include "tensorflow/c/eager/abstract_tensor_handle.h" #include "tensorflow/c/eager/immediate_execution_context.h" diff --git a/tensorflow/c/experimental/saved_model/core/tf_concrete_function_test_protos.cc b/tensorflow/c/experimental/saved_model/core/tf_concrete_function_test_protos.cc index 6250af6dba1359..1796c99dc79f17 100644 --- a/tensorflow/c/experimental/saved_model/core/tf_concrete_function_test_protos.cc +++ b/tensorflow/c/experimental/saved_model/core/tf_concrete_function_test_protos.cc @@ -178,8 +178,7 @@ tuple_value: { StructuredValue ParseStructuredValue(absl::string_view text_proto) { StructuredValue value; - CHECK(tensorflow::protobuf::TextFormat::ParseFromString(string(text_proto), - &value)); + CHECK(tensorflow::protobuf::TextFormat::ParseFromString(text_proto, &value)); return value; } diff --git a/tensorflow/c/kernels/bitcast_op_test.cc b/tensorflow/c/kernels/bitcast_op_test.cc index f2ff59a4c853e0..c44bc832547dab 100644 --- a/tensorflow/c/kernels/bitcast_op_test.cc +++ b/tensorflow/c/kernels/bitcast_op_test.cc @@ -60,7 +60,7 @@ void TestBitcastOp(Tensor* input_tensor, DataType out_type, 
(*def.mutable_attr())["type"] = outTypeAttr; def.add_input( - strings::StrCat("input1: ", DataTypeString(input_tensor->dtype()))); + absl::StrCat("input1: ", DataTypeString(input_tensor->dtype()))); std::unique_ptr kernel = CreateOpKernel(DeviceType(DEVICE_CPU), nullptr, nullptr, def, 1, &status); @@ -86,13 +86,13 @@ void TestBitcastOp(Tensor* input_tensor, DataType out_type, TEST(BitcastOpTest, TestUpcast) { Tensor int8_input(DT_UINT8, {8}); for (int i = 0; i < 8; i++) { - int8_input.vec()(i) = static_cast(1); + int8_input.vec()(i) = static_cast(1); } TestBitcastOp(&int8_input, DT_UINT64, TensorShape(), error::OK); } TEST(BitcastOpTest, TestDowncast) { - Tensor int64_input(static_cast(1)); + Tensor int64_input(static_cast(1)); TestBitcastOp(&int64_input, DT_UINT8, TensorShape({8}), error::OK); } diff --git a/tensorflow/c/kernels/histogram_summary_op.cc b/tensorflow/c/kernels/histogram_summary_op.cc index 7f34e5217c20ba..35340baa5749ce 100644 --- a/tensorflow/c/kernels/histogram_summary_op.cc +++ b/tensorflow/c/kernels/histogram_summary_op.cc @@ -151,13 +151,13 @@ void RegisterHistogramSummaryOpKernel() { TF_ATTRIBUTE_UNUSED static bool IsHistogramSummaryOpKernelRegistered = []() { if (SHOULD_REGISTER_OP_KERNEL("HistogramSummary")) { RegisterHistogramSummaryOpKernel(); - RegisterHistogramSummaryOpKernel(); - RegisterHistogramSummaryOpKernel(); - RegisterHistogramSummaryOpKernel(); - RegisterHistogramSummaryOpKernel(); - RegisterHistogramSummaryOpKernel(); - RegisterHistogramSummaryOpKernel(); - RegisterHistogramSummaryOpKernel(); + RegisterHistogramSummaryOpKernel(); + RegisterHistogramSummaryOpKernel(); + RegisterHistogramSummaryOpKernel(); + RegisterHistogramSummaryOpKernel(); + RegisterHistogramSummaryOpKernel(); + RegisterHistogramSummaryOpKernel(); + RegisterHistogramSummaryOpKernel(); RegisterHistogramSummaryOpKernel(); RegisterHistogramSummaryOpKernel(); RegisterHistogramSummaryOpKernel(); diff --git a/tensorflow/c/kernels/merge_summary_op.cc 
b/tensorflow/c/kernels/merge_summary_op.cc index 339267d094a554..ddbc3440d47dc1 100644 --- a/tensorflow/c/kernels/merge_summary_op.cc +++ b/tensorflow/c/kernels/merge_summary_op.cc @@ -50,7 +50,7 @@ void MergeSummaryOp_Delete(void* kernel) {} void MergeSummaryOp_Compute(void* kernel, TF_OpKernelContext* ctx) { tensorflow::Summary s; - std::unordered_set tags; + std::unordered_set tags; Safe_TF_StatusPtr status(TF_NewStatus()); for (int input_num = 0; input_num < TF_NumInputs(ctx); ++input_num) { TF_Tensor* input; @@ -74,7 +74,7 @@ void MergeSummaryOp_Compute(void* kernel, TF_OpKernelContext* ctx) { for (int v = 0; v < summary_in.value_size(); ++v) { // This tag is unused by the TensorSummary op, so no need to check for // duplicates. - const tensorflow::string& tag = summary_in.value(v).tag(); + const std::string& tag = summary_in.value(v).tag(); if ((!tag.empty()) && !tags.insert(tag).second) { std::ostringstream err; err << "Duplicate tag " << tag << " found in summary inputs "; diff --git a/tensorflow/c/kernels/summary_op.cc b/tensorflow/c/kernels/summary_op.cc index 486aea1af53b50..5688d00fa8fa7c 100644 --- a/tensorflow/c/kernels/summary_op.cc +++ b/tensorflow/c/kernels/summary_op.cc @@ -126,7 +126,7 @@ std::string SingleTag(TF_Tensor* tags) { if (TF_TensorElementCount(tags) == 1) { const char* single_tag = static_cast(TF_TensorData(tags))->c_str(); - return tensorflow::strings::StrCat(" (tag '", single_tag, "')"); + return absl::StrCat(" (tag '", single_tag, "')"); } else { return ""; } @@ -155,13 +155,13 @@ void RegisterScalarSummaryOpKernel() { TF_ATTRIBUTE_UNUSED bool IsScalarSummaryOpKernelRegistered = []() { if (SHOULD_REGISTER_OP_KERNEL("ScalarSummary")) { RegisterScalarSummaryOpKernel(); - RegisterScalarSummaryOpKernel(); - RegisterScalarSummaryOpKernel(); - RegisterScalarSummaryOpKernel(); - RegisterScalarSummaryOpKernel(); - RegisterScalarSummaryOpKernel(); - RegisterScalarSummaryOpKernel(); - RegisterScalarSummaryOpKernel(); + 
RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); RegisterScalarSummaryOpKernel(); RegisterScalarSummaryOpKernel(); RegisterScalarSummaryOpKernel(); diff --git a/tensorflow/c/kernels/summary_op_test.cc b/tensorflow/c/kernels/summary_op_test.cc index 11a7c06c1d2e30..43de49bc39419d 100644 --- a/tensorflow/c/kernels/summary_op_test.cc +++ b/tensorflow/c/kernels/summary_op_test.cc @@ -45,13 +45,15 @@ class DummyDevice : public DeviceBase { }; // Helper for comparing output and expected output -void ExpectSummaryMatches(const Summary& actual, const string& expected_str) { +void ExpectSummaryMatches(const Summary& actual, + const std::string& expected_str) { Summary expected; ASSERT_TRUE(protobuf::TextFormat::ParseFromString(expected_str, &expected)); EXPECT_EQ(expected.DebugString(), actual.DebugString()); } -void TestScalarSummaryOp(Tensor* tags, Tensor* values, string expected_output, +void TestScalarSummaryOp(Tensor* tags, Tensor* values, + std::string expected_output, error::Code expected_code) { // Initialize node used to fetch OpKernel absl::Status status; @@ -64,8 +66,8 @@ void TestScalarSummaryOp(Tensor* tags, Tensor* values, string expected_output, SetAttrValue(values->dtype(), &valuesTypeAttr); (*def.mutable_attr())["T"] = valuesTypeAttr; - def.add_input(strings::StrCat("input1: ", DataTypeString(tags->dtype()))); - def.add_input(strings::StrCat("input2: ", DataTypeString(values->dtype()))); + def.add_input(absl::StrCat("input1: ", DataTypeString(tags->dtype()))); + def.add_input(absl::StrCat("input2: ", DataTypeString(values->dtype()))); std::unique_ptr kernel = CreateOpKernel(DeviceType(DEVICE_CPU), nullptr, nullptr, def, 1, &status); diff --git a/tensorflow/c/kernels/tensor_shape_utils.cc b/tensorflow/c/kernels/tensor_shape_utils.cc index 
967330ccb93f87..ba54dc4eda4df9 100644 --- a/tensorflow/c/kernels/tensor_shape_utils.cc +++ b/tensorflow/c/kernels/tensor_shape_utils.cc @@ -26,15 +26,15 @@ namespace tensorflow { std::string ShapeDebugString(TF_Tensor* tensor) { // A TF_Tensor cannot have an unknown rank. CHECK_GE(TF_NumDims(tensor), 0); - tensorflow::string s = "["; + std::string s = "["; for (int i = 0; i < TF_NumDims(tensor); ++i) { - if (i > 0) tensorflow::strings::StrAppend(&s, ","); + if (i > 0) absl::StrAppend(&s, ","); int64_t dim = TF_Dim(tensor, i); // A TF_Tensor cannot have an unknown dimension. CHECK_GE(dim, 0); - tensorflow::strings::StrAppend(&s, dim); + absl::StrAppend(&s, dim); } - tensorflow::strings::StrAppend(&s, "]"); + absl::StrAppend(&s, "]"); return s; } } // namespace tensorflow diff --git a/tensorflow/c/logging.cc b/tensorflow/c/logging.cc deleted file mode 100644 index 13c9e6ac208a14..00000000000000 --- a/tensorflow/c/logging.cc +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ -#include "tensorflow/c/logging.h" - -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/stringprintf.h" - -static ::tensorflow::string BuildMessage(const char* fmt, va_list args) { - ::tensorflow::string message; - ::tensorflow::strings::Appendv(&message, fmt, args); - return message; -} - -void TF_Log(TF_LogLevel level, const char* fmt, ...) { - if (level < TF_INFO || level > TF_FATAL) return; - va_list args; - va_start(args, fmt); - auto message = BuildMessage(fmt, args); - va_end(args); - switch (level) { - case TF_INFO: - LOG(INFO) << message; - break; - case TF_WARNING: - LOG(WARNING) << message; - break; - case TF_ERROR: - LOG(ERROR) << message; - break; - case TF_FATAL: - LOG(FATAL) << message; - break; - } -} - -void TF_VLog(int level, const char* fmt, ...) { - va_list args; - va_start(args, fmt); - auto message = BuildMessage(fmt, args); - va_end(args); - VLOG(level) << message; -} - -void TF_DVLog(int level, const char* fmt, ...) { - va_list args; - va_start(args, fmt); - auto message = BuildMessage(fmt, args); - va_end(args); - DVLOG(level) << message; -} diff --git a/tensorflow/c/logging.h b/tensorflow/c/logging.h deleted file mode 100644 index 9583777b661122..00000000000000 --- a/tensorflow/c/logging.h +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ -#ifndef TENSORFLOW_C_LOGGING_H_ -#define TENSORFLOW_C_LOGGING_H_ - -#include "tensorflow/c/c_api_macros.h" - -// -------------------------------------------------------------------------- -// C API for tensorflow::Logging. - -#ifdef __cplusplus -extern "C" { -#endif - -typedef enum TF_LogLevel { - TF_INFO = 0, - TF_WARNING = 1, - TF_ERROR = 2, - TF_FATAL = 3, -} TF_LogLevel; - -TF_CAPI_EXPORT extern void TF_Log(TF_LogLevel level, const char* fmt, ...); -TF_CAPI_EXPORT extern void TF_VLog(int level, const char* fmt, ...); -TF_CAPI_EXPORT extern void TF_DVLog(int level, const char* fmt, ...); - -#ifdef __cplusplus -} -#endif - -#endif // TENSORFLOW_C_LOGGING_H_ diff --git a/tensorflow/c/tf_datatype.h b/tensorflow/c/tf_datatype.h index 02a38e9b164eb3..c991fc1f74f2e8 100644 --- a/tensorflow/c/tf_datatype.h +++ b/tensorflow/c/tf_datatype.h @@ -65,6 +65,7 @@ typedef enum TF_DataType { TF_UINT4 = 30, TF_INT2 = 31, TF_UINT2 = 32, + TF_FLOAT4_E2M1FN = 33 // 2 exponent bits, 1 mantissa bit, finite-only } TF_DataType; // TF_DataTypeSize returns the sizeof() for the underlying type corresponding diff --git a/tensorflow/cc/client/client_session.cc b/tensorflow/cc/client/client_session.cc index 95748942f06390..f776fdd9612ecd 100644 --- a/tensorflow/cc/client/client_session.cc +++ b/tensorflow/cc/client/client_session.cc @@ -34,7 +34,7 @@ class ClientSession::Impl { Impl(Session* session, std::shared_ptr graph) : session_(session), graph_(std::move(graph)) {} - static SessionOptions MakeDefaultSessionOptions(const string& target); + static SessionOptions MakeDefaultSessionOptions(const std::string& target); absl::Status MaybeExtendGraph() const; std::unique_ptr session_; @@ -44,7 +44,7 @@ class ClientSession::Impl { mutable int last_num_graph_nodes_ TF_GUARDED_BY(mu_) = 0; }; -ClientSession::ClientSession(const Scope& scope, const string& target) +ClientSession::ClientSession(const Scope& scope, 
const std::string& target) : ClientSession(scope, Impl::MakeDefaultSessionOptions(target)) {} ClientSession::ClientSession(const Scope& scope) : ClientSession(scope, "") {} @@ -64,7 +64,7 @@ ClientSession::ClientSession(const Scope& scope, ClientSession::~ClientSession() {} SessionOptions ClientSession::Impl::MakeDefaultSessionOptions( - const string& target) { + const std::string& target) { SessionOptions options; options.env = Env::Default(); options.target = target; @@ -108,7 +108,7 @@ absl::Status ClientSession::Run(const RunOptions& run_options, const std::vector& run_outputs, std::vector* outputs, RunMetadata* run_metadata) const { - std::vector> feeds; + std::vector> feeds; feeds.reserve(inputs.size()); for (auto const& feed : inputs) { TF_RETURN_IF_ERROR(feed.second.status); @@ -117,12 +117,12 @@ absl::Status ClientSession::Run(const RunOptions& run_options, std::forward_as_tuple(feed.second.tensor)); } - std::vector output_tensor_names; + std::vector output_tensor_names; output_tensor_names.reserve(fetch_outputs.size()); for (auto const& output : fetch_outputs) { output_tensor_names.push_back(output.name()); } - std::vector target_node_names; + std::vector target_node_names; target_node_names.reserve(run_outputs.size()); for (auto const& output : run_outputs) { target_node_names.push_back(output.node()->name()); @@ -138,17 +138,17 @@ absl::Status ClientSession::Run( const std::vector& run_outputs, std::vector* outputs, RunMetadata* run_metadata, const thread::ThreadPoolOptions& threadpool_options) const { - std::vector> feeds; + std::vector> feeds; for (auto const& feed : inputs) { TF_RETURN_IF_ERROR(feed.second.status); feeds.emplace_back(feed.first.name(), feed.second.tensor); } - std::vector output_tensor_names; + std::vector output_tensor_names; output_tensor_names.reserve(fetch_outputs.size()); for (auto const& output : fetch_outputs) { output_tensor_names.push_back(output.name()); } - std::vector target_node_names; + std::vector target_node_names; 
target_node_names.reserve(run_outputs.size()); for (auto const& output : run_outputs) { target_node_names.push_back(output.node()->name()); diff --git a/tensorflow/cc/client/client_session.h b/tensorflow/cc/client/client_session.h index 9dc790d0171528..bf5cf8b2c6c371 100644 --- a/tensorflow/cc/client/client_session.h +++ b/tensorflow/cc/client/client_session.h @@ -65,7 +65,7 @@ class ClientSession { /// Create a new session to evaluate the graph contained in `scope` by /// connecting to the TensorFlow runtime specified by `target`. - ClientSession(const Scope& scope, const string& target); + ClientSession(const Scope& scope, const std::string& target); /// Same as above, but use the empty string ("") as the target specification. explicit ClientSession(const Scope& scope); diff --git a/tensorflow/cc/framework/cc_op_gen_util.cc b/tensorflow/cc/framework/cc_op_gen_util.cc index 45c88283a47a6c..048378e68f4525 100644 --- a/tensorflow/cc/framework/cc_op_gen_util.cc +++ b/tensorflow/cc/framework/cc_op_gen_util.cc @@ -15,7 +15,6 @@ limitations under the License. #include "tensorflow/cc/framework/cc_op_gen_util.h" -#include #include #include #include @@ -29,6 +28,7 @@ limitations under the License. 
#include "absl/log/check.h" #include "absl/log/log.h" #include "absl/status/statusor.h" +#include "absl/strings/ascii.h" #include "absl/strings/escaping.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" @@ -107,10 +107,10 @@ string ToGuard(absl::string_view path) { string guard; guard.reserve(path.size() + 1); // + 1 -> trailing _ for (const char c : path) { - if (c >= 'A' && c <= 'Z') { + if (absl::ascii_isupper(c)) { guard += c; - } else if (c >= 'a' && c <= 'z') { - guard += c + 'A' - 'a'; + } else if (absl::ascii_islower(c)) { + guard += absl::ascii_toupper(c); } else { guard += '_'; } @@ -306,7 +306,7 @@ string ToCamelCase(absl::string_view str) { } else if (c == joiner) { cap = true; } else if (cap) { - result += toupper(c); + result += absl::ascii_toupper(c); cap = false; } else { result += c; diff --git a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.cc b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.cc index dcac1e4c0373bd..cd332ed1791849 100644 --- a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.cc +++ b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.cc @@ -42,7 +42,7 @@ namespace tensorflow { namespace cc_op { namespace { -string DefaultValue(OpDef_AttrDef attr) { +std::string DefaultValue(OpDef_AttrDef attr) { static const auto* attr_default_value_map = new absl::flat_hash_map{ @@ -80,19 +80,19 @@ string DefaultValue(OpDef_AttrDef attr) { return std::string(entry->second); } -string WriteClassFuzzDef(const OpInfo& op_info) { - string class_signature_str = absl::Substitute( +std::string WriteClassFuzzDef(const OpInfo& op_info) { + std::string class_signature_str = absl::Substitute( "class Fuzz$0 : public FuzzSession<$1> {\n", op_info.op_name, absl::StrJoin(op_info.graph_op_def.input_arg(), ", ", - [](string* out, const auto arg) { + [](std::string* out, const auto arg) { absl::StrAppend(out, "Tensor"); if (ArgIsList(arg)) absl::StrAppend(out, ", Tensor"); })); - string build_graph_body = absl::StrCat( + std::string 
build_graph_body = absl::StrCat( absl::StrJoin( op_info.graph_op_def.input_arg(), "", - [op_info](string* out, const OpDef_ArgDef arg) { + [op_info](std::string* out, const OpDef_ArgDef arg) { std::string type = "DT_UINT8"; if (arg.type() != DT_INVALID) { @@ -130,7 +130,7 @@ string WriteClassFuzzDef(const OpInfo& op_info) { } }), absl::StrJoin(op_info.graph_op_def.attr(), "", - [op_info](string* out, const OpDef_AttrDef attr) { + [op_info](std::string* out, const OpDef_AttrDef attr) { if (op_info.inferred_input_attrs.count(attr.name()) == 0 && !attr.has_default_value()) { @@ -139,22 +139,22 @@ string WriteClassFuzzDef(const OpInfo& op_info) { } })); - string constructor_call_str = absl::Substitute( + std::string constructor_call_str = absl::Substitute( " tensorflow::ops::$0(scope.WithOpName(\"output\")$1);\n", op_info.op_name, absl::StrCat( op_info.api_def.arg_order().empty() ? absl::StrJoin(op_info.api_def.in_arg(), "", - [](string* out, const auto api_def_arg) { + [](std::string* out, const auto api_def_arg) { strings::StrAppend(out, ", ", api_def_arg.name()); }) : absl::StrJoin(op_info.api_def.arg_order(), "", - [](string* out, const auto name) { + [](std::string* out, const auto name) { strings::StrAppend(out, ", ", name); }), absl::StrJoin(op_info.graph_op_def.attr(), "", - [op_info](string* out, const OpDef_AttrDef attr) { + [op_info](std::string* out, const OpDef_AttrDef attr) { if (op_info.inferred_input_attrs.count(attr.name()) == 0 && !attr.has_default_value()) { @@ -162,20 +162,20 @@ string WriteClassFuzzDef(const OpInfo& op_info) { } }))); - string fuzz_impl_signature_str = absl::Substitute( + std::string fuzz_impl_signature_str = absl::Substitute( " void FuzzImpl($0) final {\n", absl::StrJoin( op_info.graph_op_def.input_arg(), ", ", - [](string* out, const auto arg) { + [](std::string* out, const auto arg) { strings::StrAppend(out, "const Tensor& ", arg.name(), "_0"); if (ArgIsList(arg)) strings::StrAppend(out, ", const Tensor& ", arg.name(), "_1"); 
})); - string run_inputs_str = absl::Substitute( + std::string run_inputs_str = absl::Substitute( " RunInputs({$0});\n", absl::StrJoin(op_info.graph_op_def.input_arg(), ", ", - [](string* out, const auto arg) { + [](std::string* out, const auto arg) { if (ArgIsList(arg)) { strings::StrAppend( out, "{\"", arg.name(), "\", ", arg.name(), "_0}, ", @@ -186,7 +186,7 @@ string WriteClassFuzzDef(const OpInfo& op_info) { } })); - string fuzz_class_def = strings::StrCat( + std::string fuzz_class_def = strings::StrCat( class_signature_str, " void BuildGraph(const Scope& scope) override {\n", build_graph_body, constructor_call_str, " }\n", fuzz_impl_signature_str, run_inputs_str, " }\n", "};\n"); @@ -194,24 +194,24 @@ string WriteClassFuzzDef(const OpInfo& op_info) { return fuzz_class_def; } -string WriteFuzzTest(const OpInfo& op_info) { +std::string WriteFuzzTest(const OpInfo& op_info) { return absl::Substitute( "FUZZ_TEST_F(Fuzz$0, Fuzz).WithDomains($1);\n", op_info.op_name, absl::StrJoin(op_info.graph_op_def.input_arg(), ", ", - [](string* out, const auto arg) { + [](std::string* out, const auto arg) { absl::StrAppend(out, "AnyTensor()"); if (ArgIsList(arg)) absl::StrAppend(out, ", AnyTensor()"); })); } -string FuzzerFileStart() { - const string fuzz_namespace_begin = R"namespace( +std::string FuzzerFileStart() { + const std::string fuzz_namespace_begin = R"namespace( namespace tensorflow { namespace fuzzing { )namespace"; - const string fuzz_header = + const std::string fuzz_header = absl::StrCat(R"include(// This file is MACHINE GENERATED! Do not edit. 
#include "tensorflow/cc/ops/const_op.h" @@ -224,8 +224,8 @@ namespace fuzzing { return fuzz_header; } -string FuzzerFileEnd() { - const string fuzz_footer = R"footer( +std::string FuzzerFileEnd() { + const std::string fuzz_footer = R"footer( } // namespace fuzzing } // namespace tensorflow )footer"; @@ -258,7 +258,7 @@ bool OpFuzzingIsOk(const OpInfo& op_info) { } // TODO(unda) : zero input ops - std::set zero_input_ops = {"Placeholder", "ImmutableConst"}; + std::set zero_input_ops = {"Placeholder", "ImmutableConst"}; if (zero_input_ops.find(op_info.op_name) != zero_input_ops.end()) { std::cout << "NOT fuzzing: " << op_info.graph_op_def.name() << " takes zero inputs.\n"; @@ -266,19 +266,19 @@ bool OpFuzzingIsOk(const OpInfo& op_info) { } // TODO(unda, 253431636): constrained kernel - std::set constrained_kernel = {"Diag", - "DiagPart", - "GatherNd", - "GatherV2", - "QuantizeAndDequantizeV2", - "QuantizeAndDequantizeV3", - "QuantizeAndDequantizeV4", - "QuantizeAndDequantizeV4Grad", - "QuantizedConcat", - "QuantizedInstanceNorm", - "QuantizedReshape", - "ScatterNd", - "TensorScatterUpdate"}; + std::set constrained_kernel = {"Diag", + "DiagPart", + "GatherNd", + "GatherV2", + "QuantizeAndDequantizeV2", + "QuantizeAndDequantizeV3", + "QuantizeAndDequantizeV4", + "QuantizeAndDequantizeV4Grad", + "QuantizedConcat", + "QuantizedInstanceNorm", + "QuantizedReshape", + "ScatterNd", + "TensorScatterUpdate"}; // TODO(unda, b/253431636): constrained kernel if (constrained_kernel.find(op_info.op_name) != constrained_kernel.end()) { @@ -297,7 +297,7 @@ bool OpFuzzingIsOk(const OpInfo& op_info) { } } - std::set unhandled_attr_types = { + std::set unhandled_attr_types = { "list(type)", "func", "float", "bool", "tensor", "list(string)", "list(bool)", "list(shape)", "list(tensor)", "list(attr)"}; @@ -321,7 +321,7 @@ bool OpFuzzingIsOk(const OpInfo& op_info) { return true; } -string WriteSingleFuzzer(const OpInfo& op_info, bool is_fuzzable) { +std::string WriteSingleFuzzer(const 
OpInfo& op_info, bool is_fuzzable) { return absl::StrCat( FuzzerFileStart(), is_fuzzable ? WriteClassFuzzDef(op_info) : "", is_fuzzable ? WriteFuzzTest(op_info) : "", FuzzerFileEnd()); diff --git a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.h b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.h index c11c9635d6d149..9dfee93e55e2e1 100644 --- a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.h +++ b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.h @@ -25,7 +25,7 @@ namespace tensorflow { namespace cc_op { // String with single fuzzer file content. -string WriteSingleFuzzer(const OpInfo& op_info, bool is_fuzzable); +std::string WriteSingleFuzzer(const OpInfo& op_info, bool is_fuzzable); // Do we have all we need to create a fuzzer bool OpFuzzingIsOk(const OpInfo& op_info); diff --git a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen_main.cc b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen_main.cc index f4a1eb642557de..6da6e2af6c3445 100644 --- a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen_main.cc +++ b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen_main.cc @@ -39,8 +39,9 @@ namespace tensorflow { namespace cc_op { namespace { -void WriteAllFuzzers(string root_location, std::vector api_def_dirs, - std::vector op_names) { +void WriteAllFuzzers(std::string root_location, + std::vector api_def_dirs, + std::vector op_names) { OpList ops; absl::StatusOr api_def_map = LoadOpsAndApiDefs(ops, false, api_def_dirs); @@ -60,7 +61,7 @@ void WriteAllFuzzers(string root_location, std::vector api_def_dirs, continue; } - OpInfo op_info(op_def, *api_def, std::vector()); + OpInfo op_info(op_def, *api_def, std::vector()); status.Update(env->NewWritableFile( root_location + "/" + op_def.name() + "_fuzz.cc", &fuzz_file)); status.Update( @@ -87,9 +88,9 @@ int main(int argc, char* argv[]) { for (int i = 1; i < argc; ++i) { fprintf(stdout, "Arg %d = %s\n", i, argv[i]); } - std::vector api_def_srcs = tensorflow::str_util::Split( + std::vector api_def_srcs = tensorflow::str_util::Split( 
argv[2], ",", tensorflow::str_util::SkipEmpty()); - std::vector op_names = tensorflow::str_util::Split( + std::vector op_names = tensorflow::str_util::Split( argv[3], ",", tensorflow::str_util::SkipEmpty()); tensorflow::cc_op::WriteAllFuzzers(argv[1], api_def_srcs, op_names); return 0; diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc index 357515a5dccb00..f3c3fd045a3d6f 100644 --- a/tensorflow/cc/gradients/array_grad.cc +++ b/tensorflow/cc/gradients/array_grad.cc @@ -218,9 +218,9 @@ REGISTER_GRADIENT_OP("GatherNd", GatherNdGrad); absl::Status CheckNumericsGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { - string message; + std::string message; TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "message", &message)); - string err_msg = absl::StrCat( + std::string err_msg = absl::StrCat( "Not a number (NaN) or infinity (Inf) values detected in gradient. ", message); grad_outputs->push_back(CheckNumerics(scope, grad_inputs[0], err_msg)); @@ -411,7 +411,7 @@ REGISTER_GRADIENT_OP("DepthToSpace", DepthToSpaceGrad); absl::Status MirrorPadGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { - string mode; + std::string mode; TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "mode", &mode)); grad_outputs->push_back(tensorflow::ops::internal::MirrorPadGrad( scope, grad_inputs[0], op.input(1), mode)); @@ -424,7 +424,7 @@ REGISTER_GRADIENT_OP("MirrorPad", MirrorPadGrad); absl::Status MirrorPadGradGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { - string mode; + std::string mode; TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "mode", &mode)); grad_outputs->push_back(MirrorPad(scope, grad_inputs[0], op.input(1), mode)); grad_outputs->push_back(NoGradient()); diff --git a/tensorflow/cc/gradients/image_grad.cc b/tensorflow/cc/gradients/image_grad.cc index 
77e2a3bfc38476..deb90eec264ee7 100644 --- a/tensorflow/cc/gradients/image_grad.cc +++ b/tensorflow/cc/gradients/image_grad.cc @@ -95,7 +95,7 @@ absl::Status ScaleAndTranslateGradHelper(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { - string kernel_type; + std::string kernel_type; TF_RETURN_IF_ERROR( GetNodeAttr(op.node()->attrs(), "kernel_type", &kernel_type)); bool antialias; @@ -117,7 +117,7 @@ absl::Status CropAndResizeGradHelper(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { DataType input_type; - string method; + std::string method; TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "method", &method)); TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "T", &input_type)); auto image_shape = Shape(scope, op.input(0)); diff --git a/tensorflow/cc/gradients/image_grad_test.cc b/tensorflow/cc/gradients/image_grad_test.cc index f7a39f39cfc42a..b77f5512237024 100644 --- a/tensorflow/cc/gradients/image_grad_test.cc +++ b/tensorflow/cc/gradients/image_grad_test.cc @@ -203,7 +203,7 @@ class ScaleAndTranslateGradTest : public ::testing::Test { template void MakeOp(const Tensor& x_data, const Input& y_shape, Input scale, - Input translation, const string& kernel_type, bool antialias, + Input translation, const std::string& kernel_type, bool antialias, Output* x, Output* y) { *x = Const(scope_, x_data); *y = ScaleAndTranslate(scope_, *x, y_shape, scale, translation, @@ -216,7 +216,7 @@ class ScaleAndTranslateGradTest : public ::testing::Test { template void TestScaleAndTranslate(const TensorShape x_shape, const int out_height, const int out_width, Input scale, - Input translation, const string& kernel_type, + Input translation, const std::string& kernel_type, bool antialias) { Tensor x_data = MakeData(x_shape); Output x, y; diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc index bf6f509c21ee8a..c785af15f95447 100644 --- 
a/tensorflow/cc/gradients/math_grad.cc +++ b/tensorflow/cc/gradients/math_grad.cc @@ -1070,8 +1070,8 @@ absl::Status MatMulGradHelper(const Scope& scope, const bool is_batch, absl::Status MatMulGradCommon(const Scope& scope, const Operation& op, const bool is_batch, const std::vector& grad_inputs, - const string& attr_adj_x, - const string& attr_adj_y, + const std::string& attr_adj_x, + const std::string& attr_adj_y, std::vector* grad_outputs) { auto a = op.input(0); auto b = op.input(1); diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc index 34c0a8fd54b4c4..6309080492c1da 100644 --- a/tensorflow/cc/gradients/nn_grad.cc +++ b/tensorflow/cc/gradients/nn_grad.cc @@ -54,7 +54,7 @@ absl::Status SoftmaxGrad(const Scope& scope, const Operation& op, REGISTER_GRADIENT_OP("Softmax", SoftmaxGrad); bool IsZero(const Scope& scope, const Output& grad) { - string op_type_name = grad.op().node()->type_string(); + std::string op_type_name = grad.op().node()->type_string(); if (op_type_name == "ZerosLike" || op_type_name == "Zeros") { return true; } @@ -204,7 +204,7 @@ REGISTER_GRADIENT_OP("L2Loss", L2LossGrad); absl::Status BiasAddGradHelper(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { - string data_format; + std::string data_format; TF_RETURN_IF_ERROR( GetNodeAttr(op.output(0).node()->attrs(), "data_format", &data_format)); auto dx_1 = @@ -218,9 +218,9 @@ REGISTER_GRADIENT_OP("BiasAdd", BiasAddGradHelper); absl::Status Conv2DGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { - string data_format; - string padding; - std::vector strides; + std::string data_format; + std::string padding; + std::vector strides; bool use_cudnn_on_gpu; auto attrs = op.output(0).node()->attrs(); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); @@ -245,10 +245,10 @@ REGISTER_GRADIENT_OP("Conv2D", Conv2DGrad); absl::Status 
MaxPoolGradHelper(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { - string data_format; - string padding; - std::vector strides; - std::vector ksize; + std::string data_format; + std::string padding; + std::vector strides; + std::vector ksize; auto attrs = op.output(0).node()->attrs(); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &ksize)); @@ -265,8 +265,8 @@ REGISTER_GRADIENT_OP("MaxPool", MaxPoolGradHelper); absl::Status MaxPoolGradV2Helper(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { - string data_format; - string padding; + std::string data_format; + std::string padding; auto attrs = op.output(0).node()->attrs(); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding)); @@ -283,10 +283,10 @@ REGISTER_GRADIENT_OP("MaxPoolV2", MaxPoolGradV2Helper); absl::Status MaxPool3DGradHelper(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { - std::vector ksize; - std::vector strides; - string padding; - string data_format; + std::vector ksize; + std::vector strides; + std::string padding; + std::string data_format; auto attrs = op.output(0).node()->attrs(); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &ksize)); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &strides)); @@ -304,10 +304,10 @@ REGISTER_GRADIENT_OP("MaxPool3D", MaxPool3DGradHelper); absl::Status AvgPoolGradHelper(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { - std::vector ksize; - std::vector strides; - string padding; - string data_format; + std::vector ksize; + std::vector strides; + std::string padding; + std::string data_format; auto attrs = op.output(0).node()->attrs(); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &ksize)); 
TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &strides)); @@ -325,10 +325,10 @@ REGISTER_GRADIENT_OP("AvgPool", AvgPoolGradHelper); absl::Status AvgPool3DGradHelper(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { - std::vector ksize; - std::vector strides; - string padding; - string data_format; + std::vector ksize; + std::vector strides; + std::string padding; + std::string data_format; auto attrs = op.output(0).node()->attrs(); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &ksize)); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &strides)); diff --git a/tensorflow/cc/training/queue_runner.cc b/tensorflow/cc/training/queue_runner.cc index 56ac37e86b7168..1d23f9d87e2d7d 100644 --- a/tensorflow/cc/training/queue_runner.cc +++ b/tensorflow/cc/training/queue_runner.cc @@ -17,7 +17,9 @@ limitations under the License. #include #include +#include #include +#include #include #include "absl/log/log.h" @@ -70,7 +72,7 @@ absl::Status QueueRunner::Init(const QueueRunnerDef& queue_runner_def) { queue_runner_def.enqueue_op_name().begin(), queue_runner_def.enqueue_op_name().end()); size_t op_names_size = enqueue_op_names_.size(); - if (op_names_size > kint32max) { + if (op_names_size > std::numeric_limits::max()) { return absl::Status(absl::StatusCode::kInvalidArgument, "Enqueue ops to run cannot exceed kint32max"); } diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD index a8dedd0e40997a..1722da0d390915 100644 --- a/tensorflow/compiler/aot/BUILD +++ b/tensorflow/compiler/aot/BUILD @@ -51,8 +51,8 @@ cc_library( "@local_xla//xla:status_macros", "@local_xla//xla:util", "@local_xla//xla:xla_data_proto_cc", - "@local_xla//xla/backends/cpu/runtime:convolution_lib", - "@local_xla//xla/backends/cpu/runtime:dot_lib", + "@local_xla//xla/backends/cpu/runtime:convolution_dims", + "@local_xla//xla/backends/cpu/runtime:dot_dims", "@local_xla//xla/backends/cpu/runtime:thunk_proto_cc", 
"@local_xla//xla/service/cpu:executable_proto_cc", "@local_xla//xla/tsl/platform:statusor", @@ -96,6 +96,7 @@ cc_library( ":thunk_proto_execution_deserializer", "//tensorflow/compiler/tf2xla", "//tensorflow/compiler/tf2xla:allocator", + "//tensorflow/compiler/tf2xla:encoded_buffer_allocation_info", "//tensorflow/compiler/tf2xla:mlir_tf2xla", # fixdeps: keep "//tensorflow/compiler/tf2xla:tf2xla_proto_cc", "//tensorflow/compiler/tf2xla:tf2xla_util", @@ -119,12 +120,13 @@ cc_library( "@com_google_absl//absl/types:span", "@llvm-project//llvm:Support", "@llvm-project//llvm:Target", - "@local_xla//xla:cpu_function_runtime", "@local_xla//xla:debug_options_flags", "@local_xla//xla:shape_util", "@local_xla//xla:status_macros", "@local_xla//xla:util", "@local_xla//xla:xla_data_proto_cc", + "@local_xla//xla/backends/cpu:buffer_allocation_info", + "@local_xla//xla/backends/cpu:buffer_allocation_info_util", "@local_xla//xla/backends/cpu/codegen:symbol_name_util", "@local_xla//xla/backends/cpu/runtime:thunk_proto_cc", "@local_xla//xla/backends/cpu/runtime:thunk_proto_serdes", @@ -132,7 +134,6 @@ cc_library( "@local_xla//xla/client:compile_only_client", "@local_xla//xla/hlo/builder:xla_computation", "@local_xla//xla/service:compiler", - "@local_xla//xla/service/cpu:buffer_info_util", "@local_xla//xla/service/cpu:cpu_aot_compilation_result", "@local_xla//xla/service/cpu:cpu_compiler", "@local_xla//xla/service/cpu:cpu_executable", @@ -155,7 +156,6 @@ tf_cc_test( "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", # fixdeps: keep - "@local_xla//xla:cpu_function_runtime", "@local_xla//xla:shape_util", "@local_xla//xla/service/cpu:cpu_aot_compilation_result", ] + if_llvm_x86_available([ diff --git a/tensorflow/compiler/aot/aot_only_var_handle_op.cc b/tensorflow/compiler/aot/aot_only_var_handle_op.cc index 86666b073b0f71..f6293e0a2063bb 100644 --- a/tensorflow/compiler/aot/aot_only_var_handle_op.cc +++ 
b/tensorflow/compiler/aot/aot_only_var_handle_op.cc @@ -31,7 +31,7 @@ class XlaAotOnlyVarHandleOp : public XlaOpKernel { void Compile(XlaOpKernelContext* context) override; private: - string name_; + std::string name_; }; XlaAotOnlyVarHandleOp::XlaAotOnlyVarHandleOp(OpKernelConstruction* c) diff --git a/tensorflow/compiler/aot/benchmark.cc b/tensorflow/compiler/aot/benchmark.cc index 43b9c06418c2e1..ee4af4ca65a20f 100644 --- a/tensorflow/compiler/aot/benchmark.cc +++ b/tensorflow/compiler/aot/benchmark.cc @@ -37,10 +37,10 @@ namespace benchmark { // // TODO(b/33546473): Refactor tensorflow::Env::NowMicros() so that we can re-use // the implementation without pulling in all of the Env dependencies. -static uint64 NowMicros() { +static uint64_t NowMicros() { struct timeval tv; gettimeofday(&tv, nullptr); - return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; } void DumpStatsToStdout(const Stats& stats) { diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index 054a7fdde77bd9..87cb051b75df63 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -29,6 +29,7 @@ limitations under the License. #include "absl/algorithm/container.h" #include "absl/status/status.h" #include "absl/status/statusor.h" +#include "absl/strings/ascii.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/strings/str_join.h" @@ -42,13 +43,14 @@ limitations under the License. 
#include "tensorflow/compiler/aot/embedded_protocol_buffers.h" #include "tensorflow/compiler/aot/thunk_proto_execution_deserializer.h" #include "tensorflow/compiler/tf2xla/allocator.h" +#include "tensorflow/compiler/tf2xla/encoded_buffer_allocation_info.h" #include "tensorflow/compiler/tf2xla/tf2xla.pb.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" +#include "xla/backends/cpu/buffer_allocation_info.h" +#include "xla/backends/cpu/buffer_allocation_info_util.h" #include "xla/backends/cpu/runtime/thunk.pb.h" #include "xla/backends/cpu/runtime/thunk_proto_serdes.h" -#include "xla/cpu_function_runtime.h" #include "xla/debug_options_flags.h" -#include "xla/service/cpu/buffer_info_util.h" #include "xla/service/cpu/cpu_aot_compilation_result.h" #include "xla/service/cpu/cpu_executable.h" #include "xla/shape.h" @@ -65,43 +67,37 @@ namespace tfcompile { namespace { -using BufferInfo = xla::cpu_function_runtime::BufferInfo; - -bool IsAlpha(char c) { - return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); -} - -bool IsAlphaNum(char c) { return IsAlpha(c) || (c >= '0' && c <= '9'); } +using xla::cpu::BufferAllocationInfo; // Convert an XLA type into a C++ type. 
-absl::Status XLATypeToCpp(xla::PrimitiveType type, string* str) { +absl::Status XLATypeToCpp(xla::PrimitiveType type, std::string* str) { switch (type) { case xla::PRED: *str = "bool"; break; case xla::S8: - *str = "tensorflow::int8"; + *str = "int8_t"; break; case xla::S16: - *str = "tensorflow::int16"; + *str = "int16_t"; break; case xla::S32: - *str = "tensorflow::int32"; + *str = "int32_t"; break; case xla::S64: *str = "int64_t"; break; case xla::U8: - *str = "tensorflow::uint8"; + *str = "uint8_t"; break; case xla::U16: - *str = "tensorflow::uint16"; + *str = "uint16_t"; break; case xla::U32: - *str = "tensorflow::uint32"; + *str = "uint32_t"; break; case xla::U64: - *str = "tensorflow::uint64"; + *str = "uint64_t"; break; case xla::F32: *str = "float"; @@ -117,33 +113,36 @@ absl::Status XLATypeToCpp(xla::PrimitiveType type, string* str) { } // Returns the sum of the size of each buffer in `buffer_infos`. -size_t TotalBufferBytes(const std::vector& buffer_infos) { - return std::accumulate(buffer_infos.begin(), buffer_infos.end(), size_t{0}, - [](size_t size, const BufferInfo& buffer_info) { - return size + buffer_info.size(); - }); +size_t TotalBufferBytes(absl::Span buffer_infos) { + return std::accumulate( + buffer_infos.begin(), buffer_infos.end(), size_t{0}, + [](size_t size, const BufferAllocationInfo& buffer_info) { + return size + buffer_info.size(); + }); } -// Returns a vector of BufferInfo instances in `buffer_infos` that are entry -// parameter buffers. -std::vector ExtractEntryParamBufferInfos( - const std::vector& buffer_infos) { - std::vector result; +// Returns a vector of BufferAllocationInfo instances in `buffer_infos` that are +// entry parameter buffers. 
+std::vector ExtractEntryParamBufferAllocationInfos( + absl::Span buffer_infos) { + std::vector result; std::copy_if(buffer_infos.begin(), buffer_infos.end(), - std::back_inserter(result), [](const BufferInfo& buffer_info) { + std::back_inserter(result), + [](const BufferAllocationInfo& buffer_info) { return buffer_info.is_entry_parameter(); }); return result; } -// Returns a vector of BufferInfo instances in `buffer_infos` that are temp -// buffers. -std::vector ExtractTempBufferInfos( - const std::vector& buffer_infos) { - std::vector result; +// Returns a vector of BufferAllocationInfo instances in `buffer_infos` that are +// temp buffers. +std::vector ExtractTempBufferAllocationInfos( + absl::Span buffer_infos) { + std::vector result; std::copy_if(buffer_infos.begin(), buffer_infos.end(), - std::back_inserter(result), [](const BufferInfo& buffer_info) { - return buffer_info.is_temp_buffer(); + std::back_inserter(result), + [](const BufferAllocationInfo& buffer_info) { + return buffer_info.is_temp(); }); return result; } @@ -152,11 +151,11 @@ std::vector ExtractTempBufferInfos( // are used to generate methods for args and results. 
absl::Status AddRewritesForShape( int i, const xla::Shape& shape, - std::vector>* rewrites) { - string type; + std::vector>* rewrites) { + std::string type; TF_RETURN_IF_ERROR(XLATypeToCpp(shape.element_type(), &type)); - std::vector dim_vars; - string dim_sizes, indices; + std::vector dim_vars; + std::string dim_sizes, indices; int count = 1; if (shape.dimensions().size() == 0 || (shape.dimensions().size() == 1 && shape.dimensions(0) == 1)) { @@ -165,8 +164,8 @@ absl::Status AddRewritesForShape( } else { for (int dim = 0; dim < shape.dimensions().size(); ++dim) { dim_vars.push_back(absl::StrCat("size_t dim", dim)); - dim_sizes += absl::StrCat("[", shape.dimensions(dim), "]"); - indices += absl::StrCat("[dim", dim, "]"); + absl::StrAppend(&dim_sizes, "[", shape.dimensions(dim), "]"); + absl::StrAppend(&indices, "[dim", dim, "]"); count *= shape.dimensions(dim); } } @@ -187,8 +186,9 @@ absl::Status AddRewritesForShape( // TODO(toddw): If this becomes a problem, we should be able to change the // algorithm to O(N) by using a state machine, e.g. regexps or a real // text-templating mechanism. -string RewriteWithName(const string& name, string code, - const std::vector>& rewrites) { +std::string RewriteWithName( + const std::string& name, std::string code, + const std::vector>& rewrites) { absl::StrReplaceAll(rewrites, &code); absl::StrReplaceAll({{"{{NAME}}", name}}, &code); return code; @@ -198,7 +198,7 @@ string RewriteWithName(const string& name, string code, absl::Status GenArgMethods(const tf2xla::Config& config, const xla::ProgramShapeProto& ps, const CompileResult& compile_result, - string* methods) { + std::string* methods) { const int num_args = ps.parameters_size(); // feed_size() + variable_size() is the maximum number of args as an // implementation may not create an argument for an unused variable. 
@@ -208,11 +208,11 @@ absl::Status GenArgMethods(const tf2xla::Config& config, config.variable_size(), ") and num_args(", num_args, ")"); } for (int i = 0; i < config.feed_size(); ++i) { - std::vector> rewrites; + std::vector> rewrites; TF_ASSIGN_OR_RETURN(xla::Shape shape, xla::Shape::FromProto(ps.parameters(i))); TF_RETURN_IF_ERROR(AddRewritesForShape(i, shape, &rewrites)); - const string code = R"( + const std::string code = R"( void set_arg{{NAME}}_data(const void* data) { set_arg_data({{I}}, data); } @@ -248,7 +248,7 @@ absl::Status GenArgMethods(const tf2xla::Config& config, // Generate methods for results (outputs). absl::Status GenResultMethods(const tf2xla::Config& config, const xla::ProgramShapeProto& ps, - string* methods) { + std::string* methods) { if (ps.result().element_type() != xla::TUPLE) { // The XlaCompiler we use to build the xla computation always generates a // tuple result, and we rely on this to simplify code generation. @@ -267,11 +267,11 @@ absl::Status GenResultMethods(const tf2xla::Config& config, ps.result().tuple_shapes_size(), ")"); } for (int i = 0; i < config.fetch_size(); ++i) { - std::vector> rewrites; + std::vector> rewrites; TF_ASSIGN_OR_RETURN(xla::Shape shape, xla::Shape::FromProto(ps.result().tuple_shapes(i))); TF_RETURN_IF_ERROR(AddRewritesForShape(i, shape, &rewrites)); - string code = R"( + std::string code = R"( {{TYPE}}* result{{NAME}}_data() { return static_cast<{{TYPE}}*>(result_data({{I}})); } @@ -304,14 +304,14 @@ absl::Status GenResultMethods(const tf2xla::Config& config, // Generate methods for variables. 
absl::Status GenVariableMethods(const tf2xla::Config& config, const xla::ProgramShapeProto& ps, - string* methods) { + std::string* methods) { const int num_args = ps.parameters_size(); for (int i = config.feed_size(); i < num_args; ++i) { - std::vector> rewrites; + std::vector> rewrites; TF_ASSIGN_OR_RETURN(xla::Shape shape, xla::Shape::FromProto(ps.parameters(i))); TF_RETURN_IF_ERROR(AddRewritesForShape(i, shape, &rewrites)); - const string code = R"( + const std::string code = R"( void set_var_{{NAME}}_data({{MAYBE_CONST}}{{TYPE}}* data) { set_arg_data({{I}}, data); } @@ -345,7 +345,8 @@ absl::Status GenVariableMethods(const tf2xla::Config& config, } // Generate shape infos for args (inputs). -absl::Status GenArgShapeInfos(const xla::ProgramShapeProto& ps, string* infos) { +absl::Status GenArgShapeInfos(const xla::ProgramShapeProto& ps, + std::string* infos) { for (int i = 0; i < ps.parameters_size(); ++i) { const xla::ShapeProto& shape = ps.parameters(i); if (shape.element_type() == xla::TUPLE) { @@ -383,7 +384,7 @@ absl::Status GenArgShapeInfos(const xla::ProgramShapeProto& ps, string* infos) { // Generate shape infos for results. absl::Status GenResultShapeInfos(const xla::ProgramShapeProto& ps, - string* infos) { + std::string* infos) { if (ps.result().element_type() != xla::TUPLE) { return absl::InternalError("codegen requires the XLA result to be a tuple"); } @@ -417,7 +418,7 @@ absl::Status GenResultShapeInfos(const xla::ProgramShapeProto& ps, // tf2xla::{Feed,Fetch,Variable}. Each feed or fetch name results in a C-style // string literal in the array, with nullptr terminating the array. template -string GenNameToIndexCode(const T& entries, bool generate) { +std::string GenNameToIndexCode(const T& entries, bool generate) { // No need for a static array if we're not supposed to generate the data. 
if (!generate) { return "{\n return nullptr;\n }"; @@ -432,7 +433,7 @@ string GenNameToIndexCode(const T& entries, bool generate) { end = i; } // Emit string literals up to the last non-empty name. - string code = "{\n static const char* kNames[] = {"; + std::string code = "{\n static const char* kNames[] = {"; for (int i = 0; i < end; ++i) { if (i > 0) { code += ", "; @@ -471,25 +472,24 @@ absl::Status ValidateFeedFetchCppNames(const tf2xla::Config& config) { } // Returns a list of C++ expressions that, when executed, will construct the -// BufferInfo instances in `buffer_infos`. -std::vector BufferInfosToCppExpression( - const std::vector& buffer_infos) { - std::vector buffer_infos_as_strings; - std::transform(buffer_infos.begin(), buffer_infos.end(), - std::back_inserter(buffer_infos_as_strings), - [](const BufferInfo& buffer_info) { - xla::cpu_function_runtime::EncodedBufferInfo encoded = - buffer_info.Encode(); - auto param_to_str = [](uint32_t param) -> std::string { - return param == ~0U ? "~0U" : absl::StrCat(param, "U"); - }; - return absl::StrCat( - "::xla::cpu_function_runtime::BufferInfo(" - "::xla::cpu_function_runtime::EncodedBufferInfo{", - encoded.packed_kind_and_size, "ULL, ", - param_to_str(encoded.entry_param_number), ", ", - param_to_str(encoded.result_param_number), "})"); - }); +// BufferAllocationInfo instances in `buffer_infos`. +std::vector BufferAllocationInfosToCppExpression( + absl::Span buffer_infos) { + std::vector buffer_infos_as_strings; + absl::c_transform( + buffer_infos, std::back_inserter(buffer_infos_as_strings), + [](const BufferAllocationInfo& buffer_info) { + xla::cpu::EncodedBufferAllocationInfo encoded(buffer_info); + auto param_to_str = [](int32_t param) -> std::string { + return param == -1 ? 
"~0U" : absl::StrCat(param, "U"); + }; + return absl::StrCat( + "static_cast<::xla::cpu::BufferAllocationInfo>(" + "::xla::cpu::EncodedBufferAllocationInfo{", + encoded.packed_kind_and_size, "ULL, ", + param_to_str(encoded.entry_param_number), ", ", + param_to_str(encoded.result_number), "})"); + }); return buffer_infos_as_strings; } @@ -659,8 +659,8 @@ absl::Status ExtendRewrites( const std::string function_declarations_from_obj_files, GenFunctionDeclarations(absl::MakeSpan(entry_point_symbols))); - const int64_t buffer_infos_size = aot_thunks->buffer_infos().size(); - const std::optional temp_allocation_index = + int64_t buffer_infos_size = aot_thunks->buffer_allocation_infos().size(); + std::optional temp_allocation_index = aot_thunks->temp_allocation_index(); if (temp_allocation_index.has_value() && (*temp_allocation_index < 0 || @@ -670,45 +670,36 @@ absl::Status ExtendRewrites( " is outside the range of temp sizes: [0,", buffer_infos_size, ")")); } - const bool xla_cpu_multi_thread_eigen = - xla::GetDebugOptionsFromFlags().xla_cpu_multi_thread_eigen(); - std::vector runtime_specific_includes = {R"( #include "absl/log/check.h" +#include "absl/synchronization/blocking_counter.h" #include "xla/backends/cpu/runtime/kernel_c_api.h" #include "xla/types.h")"}; if (HasThunkKind(aot_thunks->proto().thunk_sequence(), xla::cpu::ThunkProto::kDotThunk)) { - if (xla_cpu_multi_thread_eigen) { - runtime_specific_includes.push_back( - R"(#include "xla/service/cpu/runtime_matmul.h")"); - } runtime_specific_includes.push_back( - R"(#include "xla/service/cpu/runtime_single_threaded_matmul.h")"); + R"(#include "xla/backends/cpu/runtime/dot_lib.h")"); } if (HasThunkKind(aot_thunks->proto().thunk_sequence(), xla::cpu::ThunkProto::kConvolutionThunk)) { - if (xla_cpu_multi_thread_eigen) { - runtime_specific_includes.push_back( - R"(#include "xla/service/cpu/runtime_conv2d.h")"); - } - runtime_specific_includes.push_back( - R"(#include 
"xla/service/cpu/runtime_single_threaded_conv2d.h")"); + R"(#include "absl/synchronization/notification.h")"); + runtime_specific_includes.push_back( + R"(#include "xla/backends/cpu/runtime/convolution_lib.h")"); } if (HasThunkKind(aot_thunks->proto().thunk_sequence(), xla::cpu::ThunkProto::kSortThunk)) { runtime_specific_includes.push_back( - R"(#include "xla/service/cpu/runtime_key_value_sort.h")"); + R"(#include "xla/backends/cpu/runtime/sort_lib.h")"); } if (HasThunkKind(aot_thunks->proto().thunk_sequence(), xla::cpu::ThunkProto::kTopKThunk)) { runtime_specific_includes.push_back( - R"(#include "xla/service/cpu/runtime_topk.h")"); + R"(#include "xla/backends/cpu/runtime/topk_lib.h")"); } TF_ASSIGN_OR_RETURN( @@ -834,31 +825,32 @@ absl::Status ExtendRewrites( absl::Status GenerateHeader( const CodegenOpts& opts, const tf2xla::Config& config, const CompileResult& compile_result, const MetadataResult& metadata_result, - const EmbeddedConstantBuffers& embedded_constant_buffers, string* header) { + const EmbeddedConstantBuffers& embedded_constant_buffers, + std::string* header) { TF_RETURN_IF_ERROR(ValidateConfig(config)); TF_RETURN_IF_ERROR(ValidateFeedFetchCppNames(config)); - const std::vector& buffer_infos = - compile_result.aot->buffer_infos(); + absl::Span buffer_infos = + compile_result.aot->buffer_allocation_infos(); - const std::vector arg_index_table = - ::xla::cpu::CreateArgIndexTableFromBufferInfos(buffer_infos); - const std::vector result_index_table = - ::xla::cpu::CreateResultIndexTableFromBufferInfos(buffer_infos); - std::vector buffer_infos_as_strings = - BufferInfosToCppExpression(buffer_infos); + const std::vector arg_index_table = + ::xla::cpu::CreateArgIndexTable(buffer_infos); + const std::vector result_index_table = + ::xla::cpu::CreateResultIndexTable(buffer_infos); + std::vector buffer_infos_as_strings = + BufferAllocationInfosToCppExpression(buffer_infos); // Compute sizes and generate methods. 
- std::vector buffer_infos_for_args = - ExtractEntryParamBufferInfos(buffer_infos); - std::vector buffer_infos_for_temps = - ExtractTempBufferInfos(buffer_infos); + std::vector buffer_infos_for_args = + ExtractEntryParamBufferAllocationInfos(buffer_infos); + std::vector buffer_infos_for_temps = + ExtractTempBufferAllocationInfos(buffer_infos); const xla::ProgramShapeProto& ps = compile_result.program_shape; - string methods_arg, methods_result, methods_variable; + std::string methods_arg, methods_result, methods_variable; TF_RETURN_IF_ERROR(GenArgMethods(config, ps, compile_result, &methods_arg)); TF_RETURN_IF_ERROR(GenResultMethods(config, ps, &methods_result)); TF_RETURN_IF_ERROR(GenVariableMethods(config, ps, &methods_variable)); - string arg_shape_infos, result_shape_infos; + std::string arg_shape_infos, result_shape_infos; TF_RETURN_IF_ERROR(GenArgShapeInfos(ps, &arg_shape_infos)); TF_RETURN_IF_ERROR( CheckEqual(ps.parameters_size(), arg_index_table.size(), @@ -868,29 +860,29 @@ absl::Status GenerateHeader( CheckEqual(ps.result().tuple_shapes_size(), result_index_table.size(), "Result number mismatch, proto vs. 
result_index_table")); TF_ASSIGN_OR_RETURN(auto program_shape, xla::ProgramShape::FromProto(ps)); - const size_t arg_bytes_aligned = tensorflow::AlignedBufferBytes( - buffer_infos_for_args.data(), buffer_infos_for_args.size(), - /*allocate_entry_params=*/true); + const size_t arg_bytes_aligned = + tensorflow::AlignedBufferBytes(buffer_infos_for_args, + /*allocate_entry_params=*/true); const size_t arg_bytes_total = TotalBufferBytes(buffer_infos_for_args); - const size_t temp_bytes_aligned = tensorflow::AlignedBufferBytes( - buffer_infos_for_temps.data(), buffer_infos_for_temps.size(), - /*allocate_entry_params=*/true); + const size_t temp_bytes_aligned = + tensorflow::AlignedBufferBytes(buffer_infos_for_temps, + /*allocate_entry_params=*/true); const size_t temp_bytes_total = TotalBufferBytes(buffer_infos_for_temps); // Create rewrite strings for namespace start and end. - string ns_start; - for (const string& n : opts.namespaces) { + std::string ns_start; + for (const std::string& n : opts.namespaces) { ns_start += absl::StrCat("namespace ", n, " {\n"); } ns_start += "\n"; - string ns_end("\n"); + std::string ns_end("\n"); for (int i = opts.namespaces.size() - 1; i >= 0; --i) { - const string& n = opts.namespaces[i]; + const std::string& n = opts.namespaces[i]; ns_end += absl::StrCat("} // end namespace ", n, "\n"); } // Generate metadata. - const string arg_names_code = + const std::string arg_names_code = GenNameToIndexCode(config.feed(), opts.gen_name_to_index); auto variable_copy = config.variable(); @@ -899,12 +891,12 @@ absl::Status GenerateHeader( var.set_name(var.node_name()); } } - const string variable_names_code = + const std::string variable_names_code = GenNameToIndexCode(variable_copy, opts.gen_name_to_index); - const string result_names_code = + const std::string result_names_code = GenNameToIndexCode(config.fetch(), opts.gen_name_to_index); - const string include_xla_data_proto = + const std::string include_xla_data_proto = opts.gen_program_shape ? 
R"(#include "xla/xla_data.pb.h")" : ""; @@ -980,7 +972,7 @@ class {{CLASS}} final : public tensorflow::{{COMPUTATION_CLASS_BASE}} { // Byte size of each argument buffer. There are kNumArgs entries. static const ::int64_t ArgSize(::tensorflow::int32 index) { - return BufferInfos()[ArgIndexToBufferIndex()[index]].size(); + return BufferAllocationInfos()[ArgIndexToBufferIndex()[index]].size(); } // Returns static data used to create an XlaCompiledCpuFunction. @@ -989,7 +981,7 @@ class {{CLASS}} final : public tensorflow::{{COMPUTATION_CLASS_BASE}} { XlaCompiledCpuFunction::StaticData* data = new XlaCompiledCpuFunction::StaticData; set_static_data_function_library_symbol_map(data, FunctionLibrarySymbolMap()); - set_static_data_buffer_infos(data, BufferInfos()); + set_static_data_buffer_infos(data, BufferAllocationInfos()); set_static_data_num_buffers(data, kNumBuffers); set_static_data_result_index_table(data, ResultIndexToBufferIndex()); set_static_data_num_results(data, kNumResults); @@ -1081,12 +1073,12 @@ class {{CLASS}} final : public tensorflow::{{COMPUTATION_CLASS_BASE}} { // Number of buffers for the compiled computation. static constexpr size_t kNumBuffers = {{NUM_BUFFERS}}; - static const ::xla::cpu_function_runtime::BufferInfo* BufferInfos() { - static const ::xla::cpu_function_runtime::BufferInfo - kBufferInfos[kNumBuffers] = { + static const ::xla::cpu::BufferAllocationInfo* BufferAllocationInfos() { + static const ::xla::cpu::BufferAllocationInfo + kBufferAllocationInfos[kNumBuffers] = { {{BUFFER_INFOS_AS_STRING}} }; - return kBufferInfos; + return kBufferAllocationInfos; } static const ::tensorflow::int32* ResultIndexToBufferIndex() { @@ -1153,7 +1145,7 @@ class {{CLASS}} final : public tensorflow::{{COMPUTATION_CLASS_BASE}} { } // The replacement strategy is naive, but good enough for our purposes. 
- std::vector> rewrites = { + std::vector> rewrites = { {"{{ARG_BYTES_ALIGNED}}", absl::StrCat(arg_bytes_aligned)}, {"{{ARG_BYTES_TOTAL}}", absl::StrCat(arg_bytes_total)}, {"{{ARG_NAMES_CODE}}", arg_names_code}, @@ -1192,10 +1184,10 @@ class {{CLASS}} final : public tensorflow::{{COMPUTATION_CLASS_BASE}} { return absl::OkStatus(); } -static string CreateUniqueIdentifier(const CodegenOpts& opts, - absl::string_view suffix) { - string result = "__tfcompile"; - for (const string& n : opts.namespaces) { +static std::string CreateUniqueIdentifier(const CodegenOpts& opts, + absl::string_view suffix) { + std::string result = "__tfcompile"; + for (const std::string& n : opts.namespaces) { absl::StrAppend(&result, "_", n); } @@ -1301,14 +1293,15 @@ absl::Status GenerateMetadata(const CodegenOpts& opts, return absl::OkStatus(); } -absl::Status ParseCppClass(const string& cpp_class, string* class_name, - std::vector* namespaces) { +absl::Status ParseCppClass(const std::string& cpp_class, + std::string* class_name, + std::vector* namespaces) { class_name->clear(); namespaces->clear(); if (cpp_class.empty()) { return errors::InvalidArgument("empty cpp_class: " + cpp_class); } - std::vector parts = absl::StrSplit(cpp_class, "::"); + std::vector parts = absl::StrSplit(cpp_class, "::"); if (parts.front().empty()) { // Allow a fully qualified name that starts with "::". parts.erase(parts.begin()); @@ -1341,11 +1334,11 @@ absl::Status ValidateCppIdent(absl::string_view ident, absl::string_view msg) { // implementation-defined characters`. We disallow those here to give // better error messages, at the expensive of being more restrictive than // the standard. 
- if (ident[0] != '_' && !IsAlpha(ident[0])) { + if (ident[0] != '_' && !absl::ascii_isalpha(ident[0])) { return errors::InvalidArgument("illegal leading char: ", msg); } for (size_t pos = 1; pos < ident.size(); ++pos) { - if (ident[pos] != '_' && !IsAlphaNum(ident[pos])) { + if (ident[pos] != '_' && !absl::ascii_isalnum(ident[pos])) { return errors::InvalidArgument("illegal char: ", msg); } } diff --git a/tensorflow/compiler/aot/codegen.h b/tensorflow/compiler/aot/codegen.h index 77300b0fde4e3d..ff7d96720b4eba 100644 --- a/tensorflow/compiler/aot/codegen.h +++ b/tensorflow/compiler/aot/codegen.h @@ -32,14 +32,14 @@ namespace tfcompile { // and the generated metadata object file. struct CodegenOpts { // The name of the generated C++ class, wrapping the generated function. - string class_name; + std::string class_name; // Target triple for the architecture we're targeting. - string target_triple; + std::string target_triple; // Namespaces specifies a list of C++ namespaces to add to the generated // header. If empty, all symbols will be in the global namespace. - std::vector namespaces; + std::vector namespaces; // If true, generate name-to-index data for Lookup{Arg,Result}Index methods. bool gen_name_to_index = false; @@ -62,27 +62,27 @@ struct CodegenOpts { struct MetadataResult { // These are top level "extern C" declarations that are expected to be visible // wherever program_shape_access_shim is emitted. - std::vector header_variable_decls; + std::vector header_variable_decls; // program_shape_access_shim is a C++ expression that constructs the // xla::ProgramShapeProto instance for the CompileResult passed to // GenerateMetadata. - string program_shape_access_shim; + std::string program_shape_access_shim; // hlo_profile_printer_data_access_shim is a C++ expression that constructs // the xla::HloProfilePrinterData instance for the CompileResult passed to // GenerateMetadata. 
If the xla::HloProfilePrinterData is null then this is a // C++ expression that evaluates to nullptr at runtime. // This is set only for AOT legacy. - string hlo_profile_printer_data_access_shim; + std::string hlo_profile_printer_data_access_shim; // cpu_executable_access_shim is a C++ expression that constructs // a protobuf required to construct a CpuExecutable. // This is set only for AOT thunks. - string cpu_executable_access_shim; + std::string cpu_executable_access_shim; // The contents of the object (".o") file. - string object_file_data; + std::string object_file_data; }; // Generates a set of constant buffers embedded into an object file. @@ -105,14 +105,16 @@ absl::Status GenerateMetadata(const CodegenOpts& opts, absl::Status GenerateHeader( const CodegenOpts& opts, const tf2xla::Config& config, const CompileResult& compile_result, const MetadataResult& metadata_result, - const EmbeddedConstantBuffers& embedded_constant_buffers, string* header); + const EmbeddedConstantBuffers& embedded_constant_buffers, + std::string* header); // ParseCppClass parses `cpp_class` into its `class_name` and `namespaces` // components. The syntax is [[::],...]. This // mirrors the C++ syntax for referring to a class, where multiple namespaces // may precede the class name, separated by double-colons. -absl::Status ParseCppClass(const string& cpp_class, string* class_name, - std::vector* namespaces); +absl::Status ParseCppClass(const std::string& cpp_class, + std::string* class_name, + std::vector* namespaces); // ValidateCppIdent returns OK iff ident is a valid C++ identifier. The msg is // appended to error messages. diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc index afa5a86af9ef47..ec0f336d87f716 100644 --- a/tensorflow/compiler/aot/codegen_test.cc +++ b/tensorflow/compiler/aot/codegen_test.cc @@ -24,7 +24,6 @@ limitations under the License. 
#include "absl/strings/string_view.h" #include "llvm/Support/TargetSelect.h" #include "tensorflow/compiler/aot/compile.h" -#include "xla/cpu_function_runtime.h" #include "xla/service/cpu/cpu_aot_compilation_result.h" #include "xla/shape_util.h" #include "tensorflow/core/framework/tensor_shape.pb.h" @@ -40,7 +39,7 @@ namespace tensorflow { namespace tfcompile { namespace { -using ::xla::cpu_function_runtime::BufferInfo; +using ::xla::cpu::BufferAllocationInfo; void ExpectErrorContains(const absl::Status& status, absl::string_view str) { EXPECT_NE(absl::OkStatus(), status); @@ -54,7 +53,7 @@ TEST(ValidateCppIdent, Simple) { TF_EXPECT_OK(ValidateCppIdent("_abc", "")); TF_EXPECT_OK(ValidateCppIdent("_abc123", "")); // Make sure we didn't skip a valid letter or digit - string ident; + std::string ident; for (char c = 'a'; c <= 'z'; c++) { ident.append(1, c); } @@ -79,18 +78,19 @@ TEST(ValidateCppIdent, Simple) { class ParseCppClassTest : public ::testing::Test { protected: - void ExpectOK(const string& cpp_class, const string& want_class_name, - const std::vector& want_namespaces) { - string class_name; - std::vector namespaces; + void ExpectOK(const std::string& cpp_class, + const std::string& want_class_name, + const std::vector& want_namespaces) { + std::string class_name; + std::vector namespaces; TF_EXPECT_OK(ParseCppClass(cpp_class, &class_name, &namespaces)); EXPECT_EQ(class_name, want_class_name); EXPECT_EQ(namespaces, want_namespaces); } - void ExpectFail(const string& cpp_class) { - string class_name; - std::vector namespaces; + void ExpectFail(const std::string& cpp_class) { + std::string class_name; + std::vector namespaces; EXPECT_NE(ParseCppClass(cpp_class, &class_name, &namespaces), absl::OkStatus()) << cpp_class; @@ -111,7 +111,7 @@ TEST_F(ParseCppClassTest, ParseOK) { ExpectOK("::_foo::MyClass", "MyClass", {"_foo"}); ExpectOK("::_foo::_MyClass", "_MyClass", {"_foo"}); // Make sure we didn't skip a valid letter or digit - string ident; + std::string 
ident; for (char c = 'a'; c <= 'z'; c++) { ident.append(1, c); } @@ -144,10 +144,10 @@ TEST_F(ParseCppClassTest, ParseFail) { } static void CompareWithGoldenFile( - const string& tensorflow_relative_golden_file_name, - const string& expected_contents, bool ignore_cr) { + const std::string& tensorflow_relative_golden_file_name, + const std::string& expected_contents, bool ignore_cr) { // Get rid of all CR characters, we may be running under windows. - string sanitized_expected_contents(expected_contents); + std::string sanitized_expected_contents(expected_contents); if (ignore_cr) { sanitized_expected_contents.erase( std::remove(sanitized_expected_contents.begin(), @@ -160,7 +160,7 @@ static void CompareWithGoldenFile( // blaz test --test_strategy=local \ // "third_party/tensorflow/compiler/aot:codegen_test" const bool update_golden = false; - string golden_file_name = + std::string golden_file_name = GetDataDependencyFilepath(tensorflow_relative_golden_file_name); if (update_golden) { @@ -168,7 +168,7 @@ static void CompareWithGoldenFile( WriteStringToFile(Env::Default(), golden_file_name, expected_contents)); } - string golden_file_contents; + std::string golden_file_contents; TF_ASSERT_OK(ReadFileToString(Env::Default(), golden_file_name, &golden_file_contents)); if (ignore_cr) { diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc index 7d0897829b98ca..48c92bf346926f 100644 --- a/tensorflow/compiler/aot/compile.cc +++ b/tensorflow/compiler/aot/compile.cc @@ -212,7 +212,7 @@ absl::Status CompileGraph(GraphDef graph_def, const tf2xla::Config& config, return CompileXla(client, computation, aot_opts, compile_result); } -static absl::Status ReadProtoFile(const string& fname, +static absl::Status ReadProtoFile(const std::string& fname, protobuf::Message* proto) { if (absl::EndsWith(fname, ".pbtxt")) { return ReadTextProto(Env::Default(), fname, proto); @@ -297,7 +297,7 @@ absl::Status Main(const MainFlags& flags) { 
TF_RETURN_IF_ERROR(ReadProtoFile(flags.config, &config)); TF_RETURN_IF_ERROR(ValidateConfig(config)); if (flags.dump_fetch_nodes) { - std::set nodes; + std::set nodes; for (const tf2xla::Fetch& fetch : config.fetch()) { nodes.insert(fetch.id().node_name()); } @@ -368,7 +368,7 @@ absl::Status Main(const MainFlags& flags) { GenerateMetadata(codegen_opts, compile_result, &metadata_result)); TF_RETURN_IF_ERROR(WriteStringToFile(env, flags.out_metadata_object, metadata_result.object_file_data)); - string header; + std::string header; TF_RETURN_IF_ERROR(GenerateHeader(codegen_opts, config, compile_result, metadata_result, embedded_constant_buffers, &header)); diff --git a/tensorflow/compiler/aot/compile.h b/tensorflow/compiler/aot/compile.h index 303854f40ed88c..2a0418126b8aaf 100644 --- a/tensorflow/compiler/aot/compile.h +++ b/tensorflow/compiler/aot/compile.h @@ -38,7 +38,7 @@ struct CompileResult { // Contains object file and meta-info. std::unique_ptr aot; xla::ProgramShapeProto program_shape; // Static shape of args and results. - string entry_point; // Name of generated function. + std::string entry_point; // Name of generated function. int pointer_size = 0; // Size of a pointer in bytes. 
}; diff --git a/tensorflow/compiler/aot/embedded_constant_buffers.cc b/tensorflow/compiler/aot/embedded_constant_buffers.cc index 987dac62bca0fe..b56ca80e26e875 100644 --- a/tensorflow/compiler/aot/embedded_constant_buffers.cc +++ b/tensorflow/compiler/aot/embedded_constant_buffers.cc @@ -118,8 +118,8 @@ static absl::StatusOr CodegenModule( static absl::StatusOr> GetTargetMachineFromTriple(absl::string_view target_triple) { std::string error; - std::string normalized_triple = - llvm::Triple::normalize(AsStringRef(absl::string_view(target_triple))); + llvm::Triple normalized_triple( + llvm::Triple::normalize(AsStringRef(absl::string_view(target_triple)))); const llvm::Target* target = llvm::TargetRegistry::lookupTarget(normalized_triple, error); if (target == nullptr) { @@ -128,7 +128,7 @@ GetTargetMachineFromTriple(absl::string_view target_triple) { } return absl::WrapUnique(target->createTargetMachine( - llvm::Triple(normalized_triple), /*CPU=*/"", + normalized_triple, /*CPU=*/"", /*Features=*/"", llvm::TargetOptions(), std::nullopt)); } diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.cc b/tensorflow/compiler/aot/embedded_protocol_buffers.cc index ae5b62ccec01a9..1626686ba465ad 100644 --- a/tensorflow/compiler/aot/embedded_protocol_buffers.cc +++ b/tensorflow/compiler/aot/embedded_protocol_buffers.cc @@ -41,9 +41,9 @@ using xla::llvm_ir::AsStringRef; static void AddEmbeddedProtocolBufferToLlvmModule( llvm::Module* module, const ::tensorflow::protobuf::MessageLite& proto, - absl::string_view unique_identifier, string* protobuf_array_symbol_name, - int64_t* protobuf_array_size) { - string protobuf_array_contents = proto.SerializeAsString(); + absl::string_view unique_identifier, + std::string* protobuf_array_symbol_name, int64_t* protobuf_array_size) { + std::string protobuf_array_contents = proto.SerializeAsString(); *protobuf_array_symbol_name = absl::StrCat(unique_identifier, "_protobuf_array_contents"); *protobuf_array_size = 
protobuf_array_contents.size(); @@ -58,10 +58,10 @@ static void AddEmbeddedProtocolBufferToLlvmModule( protobuf_array_initializer, AsStringRef(*protobuf_array_symbol_name)); } -static string CreateCPPShimExpression( +static std::string CreateCPPShimExpression( absl::string_view qualified_cpp_protobuf_name, absl::string_view protobuf_array_symbol_name, int64_t protobuf_array_size) { - string code = + std::string code = "[]() {\n" " {{PROTOBUF_NAME}}* proto = new {{PROTOBUF_NAME}};\n" " proto->ParseFromArray(&{{ARRAY_SYMBOL}}[0], {{ARRAY_SIZE}});\n" @@ -77,7 +77,7 @@ static string CreateCPPShimExpression( }); } -static absl::StatusOr CodegenModule( +static absl::StatusOr CodegenModule( llvm::TargetMachine* target_machine, std::unique_ptr module) { llvm::SmallVector stream_buffer; llvm::raw_svector_ostream ostream(stream_buffer); @@ -91,14 +91,14 @@ static absl::StatusOr CodegenModule( codegen_passes.run(*module); - return string(stream_buffer.begin(), stream_buffer.end()); + return std::string(stream_buffer.begin(), stream_buffer.end()); } static absl::StatusOr> GetTargetMachineFromTriple(absl::string_view target_triple) { std::string error; - std::string normalized_triple = - llvm::Triple::normalize(AsStringRef(absl::string_view(target_triple))); + llvm::Triple normalized_triple( + llvm::Triple::normalize(AsStringRef(absl::string_view(target_triple)))); const llvm::Target* target = llvm::TargetRegistry::lookupTarget(normalized_triple, error); if (target == nullptr) { @@ -107,7 +107,7 @@ GetTargetMachineFromTriple(absl::string_view target_triple) { } return absl::WrapUnique(target->createTargetMachine( - llvm::Triple(normalized_triple), /*CPU=*/"", + normalized_triple, /*CPU=*/"", /*Features=*/"", llvm::TargetOptions(), std::nullopt)); } @@ -124,9 +124,9 @@ absl::StatusOr CreateEmbeddedProtocolBuffers( EmbeddedProtocolBuffers result; for (const ProtobufToEmbed& protobuf_to_embed : protobufs_to_embed) { - string cpp_shim, cpp_variable_decl; + std::string cpp_shim, 
cpp_variable_decl; if (protobuf_to_embed.message) { - string protobuf_array_symbol_name; + std::string protobuf_array_symbol_name; int64_t protobuf_array_size; AddEmbeddedProtocolBufferToLlvmModule( diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.h b/tensorflow/compiler/aot/embedded_protocol_buffers.h index 0af4d4a3362f8c..aa3553f3b6a85b 100644 --- a/tensorflow/compiler/aot/embedded_protocol_buffers.h +++ b/tensorflow/compiler/aot/embedded_protocol_buffers.h @@ -37,11 +37,11 @@ struct EmbeddedProtocolBuffers { struct CPPShim { // `expression` is a C++ expression that creates an instance of said // protocol buffer when executed. - string expression; + std::string expression; // `variable_decl` is an "extern C" array declaration that is used in // `expression`. It must be visible wherever `expression` is emitted. - string variable_decl; + std::string variable_decl; }; // Each cpp_shim corresponds to one embedded protocol buffer. @@ -50,20 +50,20 @@ struct EmbeddedProtocolBuffers { // The contents of the object (".o") file the protocol buffers are embbed in. // This needs to be linked in to any program that wants to execute any of the // expressions in `cpp_shims`. - string object_file_data; + std::string object_file_data; }; // Describes a protocol buffer to embed into an object file. struct ProtobufToEmbed { // `symbol_prefix` is prefix that is guaranteed to be unique across the binary // or DSO the generated object file will be linked into. - string symbol_prefix; + std::string symbol_prefix; // `qualified_cpp_protobuf_name` is a qualified ("qualified" as in C++ // namespace qualified) protocol buffer name. This is only used in // CPPShim::expression so relatively qualified names are fine as long as // they're valid wherever CPPShim::expression is emitted. - string qualified_cpp_protobuf_name; + std::string qualified_cpp_protobuf_name; // `message` is the protocol buffer to be embedded. 
It is allowed to be // nullptr, in which case the generated C++ shim expression is just `nullptr`, diff --git a/tensorflow/compiler/aot/flags.h b/tensorflow/compiler/aot/flags.h index 9a3f2900dbafe4..5d0f93f7d67b88 100644 --- a/tensorflow/compiler/aot/flags.h +++ b/tensorflow/compiler/aot/flags.h @@ -27,27 +27,27 @@ namespace tfcompile { // Flags for the tfcompile binary. See *.cc file for descriptions. struct MainFlags { - string graph; - string debug_info; - string debug_info_path_begin_marker; - string config; + std::string graph; + std::string debug_info; + std::string debug_info_path_begin_marker; + std::string config; bool dump_fetch_nodes = false; - string target_triple; - string target_cpu; - string target_features; - string entry_point; - string cpp_class; - string out_function_object; - string out_metadata_object; - string out_header; - string out_constant_buffers_object; - string out_session_module; - string mlir_components; + std::string target_triple; + std::string target_cpu; + std::string target_features; + std::string entry_point; + std::string cpp_class; + std::string out_function_object; + std::string out_metadata_object; + std::string out_header; + std::string out_constant_buffers_object; + std::string out_session_module; + std::string mlir_components; bool experimental_quantize = false; // Sanitizer pass options bool sanitize_dataflow = false; - string sanitize_abilists_dataflow; + std::string sanitize_abilists_dataflow; // C++ codegen options bool gen_name_to_index = false; diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index 8caeec32b7bc5e..67fea2e6a022c1 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -63,6 +63,7 @@ def _tfcompile_model_library_rule_impl(ctx): "--xla_cpu_fast_math_honor_functions=false " + "--xla_cpu_fast_math_honor_division=false " + "--xla_cpu_enable_fast_min_max=true " + + "--xla_cpu_experimental_ynn_fusion_type= " + 
additional_xla_flags + " " + "$${XLA_FLAGS:-}' "), "CUDA_VISIBLE_DEVICES": "", @@ -321,10 +322,11 @@ def _tf_library( # include_standard_runtime_deps is False. Without them, the # generated code will fail to compile. "//third_party/absl/log:check", + "//third_party/absl/synchronization", + "//tensorflow/core:framework_lite", "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function", "@local_xla//xla:types", "@local_xla//xla/backends/cpu/runtime:kernel_c_api", - "//tensorflow/core:framework_lite", "@local_xla//xla/backends/cpu/runtime:rng_state_lib", ] + (need_xla_data_proto and [ # If we're generating the program shape, we must depend on the @@ -335,12 +337,11 @@ def _tf_library( ] or []) + (include_standard_runtime_deps and [ # TODO(cwhipkey): only depend on kernel code that the model actually # needed. - "@local_xla//xla/service/cpu:runtime_conv2d", - "@local_xla//xla/service/cpu:runtime_custom_call_status", - "@local_xla//xla/service/cpu:runtime_key_value_sort", + "@local_xla//xla/backends/cpu/runtime:dot_lib", + "@local_xla//xla/backends/cpu/runtime:sort_lib", + "@local_xla//xla/backends/cpu/runtime:topk_lib", + "@local_xla//xla/backends/cpu/runtime:convolution_lib", "@local_xla//xla/service/cpu:runtime_matmul", - "@local_xla//xla/service/cpu:runtime_topk", - "@local_xla//xla/service/cpu:runtime_single_threaded_conv2d", "@local_xla//xla/service/cpu:runtime_single_threaded_matmul", "@eigen_archive//:eigen3", ] or []) + (use_xla_nanort_runtime and [ diff --git a/tensorflow/compiler/aot/thunk_proto_execution_deserializer.cc b/tensorflow/compiler/aot/thunk_proto_execution_deserializer.cc index d2ced20a8d5eec..485bfd36dfa0a5 100644 --- a/tensorflow/compiler/aot/thunk_proto_execution_deserializer.cc +++ b/tensorflow/compiler/aot/thunk_proto_execution_deserializer.cc @@ -28,8 +28,8 @@ limitations under the License. 
#include "absl/strings/str_join.h" #include "absl/strings/str_replace.h" #include "absl/strings/string_view.h" -#include "xla/backends/cpu/runtime/convolution_lib.h" -#include "xla/backends/cpu/runtime/dot_lib.h" +#include "xla/backends/cpu/runtime/convolution_dims.h" +#include "xla/backends/cpu/runtime/dot_dims.h" #include "xla/backends/cpu/runtime/thunk.pb.h" #include "xla/layout_util.h" #include "xla/service/cpu/executable.pb.h" @@ -127,32 +127,23 @@ ThunkProtoExecutionDeserializer::ThunkSpecificRunImplFromThunkSequence( } absl::StatusOr ThunkProtoExecutionDeserializer::GetMatmulFunction( - xla::PrimitiveType xla_type, bool is_single_threaded) { + xla::PrimitiveType xla_type) { switch (xla_type) { case xla::F16: - return is_single_threaded - ? "__xla_cpu_runtime_EigenSingleThreadedMatMulF16" - : "__xla_cpu_runtime_EigenMatMulF16"; + return "::xla::cpu::internal::TypedMatMul"; case xla::F32: - return is_single_threaded - ? "__xla_cpu_runtime_EigenSingleThreadedMatMulF32" - : "__xla_cpu_runtime_EigenMatMulF32"; + return "::xla::cpu::internal::TypedMatMul"; case xla::F64: - return is_single_threaded - ? "__xla_cpu_runtime_EigenSingleThreadedMatMulF64" - : "__xla_cpu_runtime_EigenMatMulF64"; + return "::xla::cpu::internal::TypedMatMul"; case xla::C64: - return is_single_threaded - ? "__xla_cpu_runtime_EigenSingleThreadedMatMulC64" - : "__xla_cpu_runtime_EigenMatMulC64"; + return "::xla::cpu::internal::TypedMatMul, " + "std::complex, std::complex"; case xla::C128: - return is_single_threaded - ? "__xla_cpu_runtime_EigenSingleThreadedMatMulC128" - : "__xla_cpu_runtime_EigenMatMulC128"; + return "::xla::cpu::internal::TypedMatMul, " + "std::complex, std::complex"; case xla::S32: - return is_single_threaded - ? 
"__xla_cpu_runtime_EigenSingleThreadedMatMulS32" - : "__xla_cpu_runtime_EigenMatMulS32"; + return "::xla::cpu::internal::TypedMatMul"; default: return xla::Internal("Unsupported xla type: %d", xla_type); } @@ -166,43 +157,23 @@ absl::StatusOr ThunkProtoExecutionDeserializer::GetDotThunkRunImpl( } const xla::cpu::DotThunkProto& dot_thunk = thunk.dot_thunk(); - absl::string_view dot_thunk_invocation_format = xla_cpu_multi_thread_eigen_ - ? R"( + absl::string_view dot_thunk_invocation_format = R"( // Dot Thunk { + absl::BlockingCounter done({{BATCH_SIZE}}); for (int64_t i = 0; i < {{BATCH_SIZE}}; ++i) { - if (run_options->intra_op_thread_pool() != nullptr) { - {{MATMUL_FUNCTION}}( - run_options, - {{OUTPUT_PTR}} + {{OUTPUT_STRIDE}} * i, - {{LHS_PTR}} + {{LHS_STRIDE}} * i, - {{RHS_PTR}} + {{RHS_STRIDE}} * i, - {{M}}, {{N}}, {{K}}, {{TRANSPOSE_LHS}}, {{TRANSPOSE_RHS}}); - } else { - {{SINGLE_THREADED_MATMUL_FUNCTION}}( - nullptr, - {{OUTPUT_PTR}} + {{OUTPUT_STRIDE}} * i, - {{LHS_PTR}} + {{LHS_STRIDE}} * i, - {{RHS_PTR}} + {{RHS_STRIDE}} * i, - {{M}}, {{N}}, {{K}}, {{TRANSPOSE_LHS}}, {{TRANSPOSE_RHS}}); - } + {{MATMUL_FUNCTION}}( + run_options->intra_op_thread_pool(), + {{OUTPUT_PTR}} + {{OUTPUT_STRIDE}} * i, + {{LHS_PTR}} + {{LHS_STRIDE}} * i, + {{RHS_PTR}} + {{RHS_STRIDE}} * i, + {{M}}, {{N}}, {{K}}, {{TRANSPOSE_LHS}}, {{TRANSPOSE_RHS}}, + [&done] { done.DecrementCount(); } + ); } + done.Wait(); } - )" - : - R"( - // Dot Thunk - { - for (int64_t i = 0; i < {{BATCH_SIZE}}; ++i) { - {{SINGLE_THREADED_MATMUL_FUNCTION}}( - nullptr, - {{OUTPUT_PTR}} + {{OUTPUT_STRIDE}} * i, - {{LHS_PTR}} + {{LHS_STRIDE}} * i, - {{RHS_PTR}} + {{RHS_STRIDE}} * i, - {{M}}, {{N}}, {{K}}, {{TRANSPOSE_LHS}}, {{TRANSPOSE_RHS}}); - } - } - )"; + )"; if (!(dot_thunk.lhs_buffer_shape().shape().element_type() == dot_thunk.rhs_buffer_shape().shape().element_type() && @@ -214,13 +185,7 @@ absl::StatusOr ThunkProtoExecutionDeserializer::GetDotThunkRunImpl( TF_ASSIGN_OR_RETURN( std::string 
matmul_function, - GetMatmulFunction(dot_thunk.lhs_buffer_shape().shape().element_type(), - /*is_single_threaded=*/false)); - - TF_ASSIGN_OR_RETURN( - std::string single_threaded_matmul_function, - GetMatmulFunction(dot_thunk.lhs_buffer_shape().shape().element_type(), - /*is_single_threaded=*/true)); + GetMatmulFunction(dot_thunk.lhs_buffer_shape().shape().element_type())); TF_ASSIGN_OR_RETURN(std::string data_type, CppDataTypeFromXlaType( @@ -280,7 +245,7 @@ absl::StatusOr ThunkProtoExecutionDeserializer::GetDotThunkRunImpl( int64_t out_stride = m * n; std::vector> rewrites = { - {"{{SINGLE_THREADED_MATMUL_FUNCTION}}", single_threaded_matmul_function}, + {"{{MATMUL_FUNCTION}}", matmul_function}, {"{{OUTPUT_PTR}}", output_ptr}, {"{{OUTPUT_STRIDE}}", absl::StrCat(out_stride)}, {"{{LHS_PTR}}", lhs_ptr}, @@ -294,25 +259,17 @@ absl::StatusOr ThunkProtoExecutionDeserializer::GetDotThunkRunImpl( {"{{TRANSPOSE_RHS}}", transpose_rhs ? "true" : "false"}, {"{{BATCH_SIZE}}", absl::StrCat(dot_shape.batch_size)}}; - if (xla_cpu_multi_thread_eigen_) { - rewrites.push_back({"{{MATMUL_FUNCTION}}", matmul_function}); - } - return absl::StrReplaceAll(dot_thunk_invocation_format, rewrites); }; absl::StatusOr ThunkProtoExecutionDeserializer::GetConvolutionFunction( - xla::PrimitiveType xla_type, bool is_single_threaded) { + xla::PrimitiveType xla_type) { switch (xla_type) { case xla::F16: - return is_single_threaded - ? "__xla_cpu_runtime_EigenSingleThreadedConv2DF16" - : "__xla_cpu_runtime_EigenConv2DF16"; + return "xla::cpu::internal::EigenConv2D"; case xla::F32: - return is_single_threaded - ? 
"__xla_cpu_runtime_EigenSingleThreadedConv2DF32" - : "__xla_cpu_runtime_EigenConv2DF32"; + return "xla::cpu::internal::EigenConv2D"; default: return xla::Internal("Unsupported xla type: %d", xla_type); } @@ -345,63 +302,28 @@ ThunkProtoExecutionDeserializer::GetConvolution2DRunImpl( TF_ASSIGN_OR_RETURN( std::string convolution_function, GetConvolutionFunction( - convolution_thunk.input_buffer_shape().shape().element_type(), - /*is_single_threaded=*/false)); - - TF_ASSIGN_OR_RETURN( - std::string single_threaded_convolution_function, - GetConvolutionFunction( - convolution_thunk.input_buffer_shape().shape().element_type(), - /*is_single_threaded=*/true)); + convolution_thunk.input_buffer_shape().shape().element_type())); - absl::string_view convolution_thunk_invocation_format = - xla_cpu_multi_thread_eigen_ ? R"( + absl::string_view convolution_thunk_invocation_format = R"( // Convolution Thunk { - if (run_options->intra_op_thread_pool() != nullptr) { - {{CONVOLUTION_FUNCTION}}( - run_options, - {{OUTPUT_PTR}}, {{LHS_PTR}}, {{RHS_PTR}}, {{INPUT_BATCH}}, - {{INPUT_ROWS}}, {{INPUT_COLS}}, {{INPUT_CHANNELS}}, {{KERNEL_ROWS}}, - {{KERNEL_COLS}}, {{KERNEL_CHANNELS}}, {{KERNEL_FILTERS}}, - {{OUTPUT_ROWS}}, {{OUTPUT_COLS}}, {{ROW_STRIDE}}, {{COL_STRIDE}}, - {{PADDING_TOP}}, {{PADDING_BOTTOM}}, {{PADDING_LEFT}}, - {{PADDING_RIGHT}}, {{LHS_ROW_DILATION}}, {{LHS_COL_DILATION}}, - {{RHS_ROW_DILATION}}, {{RHS_COL_DILATION}}, {{FEATURE_GROUP_COUNT}} - ); - } else { - {{SINGLE_THREADED_CONVOLUTION_FUNCTION}}( - nullptr, - {{OUTPUT_PTR}}, {{LHS_PTR}}, {{RHS_PTR}}, {{INPUT_BATCH}}, - {{INPUT_ROWS}}, {{INPUT_COLS}}, {{INPUT_CHANNELS}}, {{KERNEL_ROWS}}, - {{KERNEL_COLS}}, {{KERNEL_CHANNELS}}, {{KERNEL_FILTERS}}, - {{OUTPUT_ROWS}}, {{OUTPUT_COLS}}, {{ROW_STRIDE}}, {{COL_STRIDE}}, - {{PADDING_TOP}}, {{PADDING_BOTTOM}}, {{PADDING_LEFT}}, - {{PADDING_RIGHT}}, {{LHS_ROW_DILATION}}, {{LHS_COL_DILATION}}, - {{RHS_ROW_DILATION}}, {{RHS_COL_DILATION}}, {{FEATURE_GROUP_COUNT}} - ); - } - })" - 
: - R"( - // Convolution Thunk - { - {{SINGLE_THREADED_CONVOLUTION_FUNCTION}}( - nullptr, - {{OUTPUT_PTR}}, {{LHS_PTR}}, {{RHS_PTR}}, {{INPUT_BATCH}}, - {{INPUT_ROWS}}, {{INPUT_COLS}}, {{INPUT_CHANNELS}}, {{KERNEL_ROWS}}, - {{KERNEL_COLS}}, {{KERNEL_CHANNELS}}, {{KERNEL_FILTERS}}, - {{OUTPUT_ROWS}}, {{OUTPUT_COLS}}, {{ROW_STRIDE}}, {{COL_STRIDE}}, - {{PADDING_TOP}}, {{PADDING_BOTTOM}}, {{PADDING_LEFT}}, - {{PADDING_RIGHT}}, {{LHS_ROW_DILATION}}, {{LHS_COL_DILATION}}, - {{RHS_ROW_DILATION}}, {{RHS_COL_DILATION}}, {{FEATURE_GROUP_COUNT}} - ); - } - )"; + absl::Notification done; + {{CONVOLUTION_FUNCTION}}( + run_options->intra_op_thread_pool(), + {{OUTPUT_PTR}}, {{LHS_PTR}}, {{RHS_PTR}}, {{INPUT_BATCH}}, + {{INPUT_ROWS}}, {{INPUT_COLS}}, {{INPUT_CHANNELS}}, {{KERNEL_ROWS}}, + {{KERNEL_COLS}}, {{KERNEL_CHANNELS}}, {{KERNEL_FILTERS}}, + {{OUTPUT_ROWS}}, {{OUTPUT_COLS}}, {{ROW_STRIDE}}, {{COL_STRIDE}}, + {{PADDING_TOP}}, {{PADDING_BOTTOM}}, {{PADDING_LEFT}}, + {{PADDING_RIGHT}}, {{LHS_ROW_DILATION}}, {{LHS_COL_DILATION}}, + {{RHS_ROW_DILATION}}, {{RHS_COL_DILATION}}, {{FEATURE_GROUP_COUNT}}, + [&done] { done.Notify(); } + ); + done.WaitForNotification(); + })"; std::vector> rewrites = { - {"{{SINGLE_THREADED_CONVOLUTION_FUNCTION}}", - single_threaded_convolution_function}, + {"{{CONVOLUTION_FUNCTION}}", convolution_function}, {"{{OUTPUT_PTR}}", output_ptr}, {"{{LHS_PTR}}", lhs_ptr}, {"{{RHS_PTR}}", rhs_ptr}, @@ -428,10 +350,6 @@ ThunkProtoExecutionDeserializer::GetConvolution2DRunImpl( {"{{FEATURE_GROUP_COUNT}}", absl::StrCat(canonical_dims.feature_group_count)}}; - if (xla_cpu_multi_thread_eigen_) { - rewrites.push_back({"{{CONVOLUTION_FUNCTION}}", convolution_function}); - } - return absl::StrReplaceAll(convolution_thunk_invocation_format, rewrites); } @@ -594,35 +512,47 @@ ThunkProtoExecutionDeserializer::GetSortThunkRunImpl( std::vector buffers_to_sort; buffers_to_sort.reserve(sort_thunk.inputs_shapes_size()); - std::vector values_primitive_type_size_in_bytes; - 
values_primitive_type_size_in_bytes.reserve(sort_thunk.inputs_shapes_size()); + std::vector primitive_sizes; + primitive_sizes.reserve(sort_thunk.inputs_shapes_size()); for (const auto& buffer_proto : sort_thunk.inputs_shapes()) { buffers_to_sort.push_back( - absl::StrCat("reinterpret_cast(", + absl::StrCat("reinterpret_cast(", GetBufferAllocationString(buffer_proto.slice()), ")")); - values_primitive_type_size_in_bytes.push_back( - xla::ShapeUtil::ByteSizeOfPrimitiveType( - buffer_proto.shape().element_type())); + primitive_sizes.push_back(xla::ShapeUtil::ByteSizeOfPrimitiveType( + buffer_proto.shape().element_type())); } absl::string_view sort_thunk_invocation_format = R"( // Sort Thunk { - std::vector values = { + std::vector values = { {{BUFFERS_TO_SORT}} }; - std::vector values_primitive_type_size_in_bytes = { + std::vector primitive_sizes = { {{VALUES_PRIMITIVE_TYPE_SIZE_IN_BYTES}} }; - __xla_cpu_runtime_KeyValueSort( - {{HIGHER_DIMENSIONS}}, {{SORT_DIMENSION_ELEMENTS}}, {{LOWER_DIMENSIONS}}, - values.data(), - int32_t(values.size()), - values_primitive_type_size_in_bytes.data(), - /*is_stable=*/{{IS_STABLE}}, - reinterpret_cast(run_options), - /*prof_counters=*/nullptr, - reinterpret_cast({{SORT_FUNCTION_NAME}})); + // Type alias compatible with `FunctionLibrary::Comparator`. 
+ using Comparator = void(bool* result, const void* run_options, + const void** params, const void* buffer_table, + const void* status, const void* prof_counters); + Comparator* comparator = reinterpret_cast( + {{SORT_FUNCTION_NAME}}); + + absl::AnyInvocable less_than = + [comparator](const void** data) { + bool result; + (*comparator)(&result, nullptr, data, nullptr, nullptr, nullptr); + ABSL_ANNOTATE_MEMORY_IS_INITIALIZED(&result, sizeof(result)); + return result; + }; + + xla::cpu::internal::SortInplace( + { + {{HIGHER_DIMENSIONS}}, + {{SORT_DIMENSION_ELEMENTS}}, + {{LOWER_DIMENSIONS}} + }, + values, primitive_sizes, {{IS_STABLE}}, &less_than); })"; TF_ASSIGN_OR_RETURN( @@ -660,7 +590,7 @@ ThunkProtoExecutionDeserializer::GetSortThunkRunImpl( {"{{SORT_FUNCTION_NAME}}", sort_thunk.comparator_name()}, {"{{BUFFERS_TO_SORT}}", absl::StrJoin(buffers_to_sort, ", ")}, {"{{VALUES_PRIMITIVE_TYPE_SIZE_IN_BYTES}}", - absl::StrJoin(values_primitive_type_size_in_bytes, ", ")}, + absl::StrJoin(primitive_sizes, ", ")}, {"{{IS_STABLE}}", sort_thunk.is_stable() ? "true" : "false"}, }); } @@ -677,7 +607,7 @@ ThunkProtoExecutionDeserializer::GetTopKThunkRunImpl( absl::string_view topk_thunk_invocation_format = R"( // TopK Thunk { - __xla_cpu_runtime_TopKF32({{BATCH_SIZE}}, {{INPUT_SIZE}}, {{K}}, + ::xla::cpu::internal::TopK({{BATCH_SIZE}}, {{INPUT_SIZE}}, {{K}}, reinterpret_cast({{VALUES_PTR}}), reinterpret_cast({{OUTPUT_PTR}}), reinterpret_cast({{INDICES_PTR}})); diff --git a/tensorflow/compiler/aot/thunk_proto_execution_deserializer.h b/tensorflow/compiler/aot/thunk_proto_execution_deserializer.h index 1e5e47f140020e..a5f7ddcd5fa13b 100644 --- a/tensorflow/compiler/aot/thunk_proto_execution_deserializer.h +++ b/tensorflow/compiler/aot/thunk_proto_execution_deserializer.h @@ -20,7 +20,7 @@ limitations under the License. 
#include #include "absl/status/statusor.h" -#include "xla/backends/cpu/runtime/convolution_lib.h" +#include "xla/backends/cpu/runtime/convolution_dims.h" #include "xla/backends/cpu/runtime/thunk.pb.h" #include "xla/debug_options_flags.h" #include "xla/service/cpu/executable.pb.h" @@ -44,14 +44,13 @@ class ThunkProtoExecutionDeserializer { const xla::cpu::ThunkSequenceProto& thunk_sequence_proto); protected: - absl::StatusOr GetMatmulFunction(xla::PrimitiveType xla_type, - bool is_single_threaded); + absl::StatusOr GetMatmulFunction(xla::PrimitiveType xla_type); absl::StatusOr GetDotThunkRunImpl( const xla::cpu::ThunkProto& thunk); absl::StatusOr GetConvolutionFunction( - xla::PrimitiveType xla_type, bool is_single_threaded); + xla::PrimitiveType xla_type); absl::StatusOr GetConvolution2DRunImpl( const xla::cpu::ConvolutionThunkProto& convolution_thunk, diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index c65bb6c44b1079..7c1772c084750c 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -654,6 +654,7 @@ cc_library( "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", + "@local_xla//xla:future", "@local_xla//xla:shape_util", "@local_xla//xla:status_macros", "@local_xla//xla:util", @@ -662,7 +663,6 @@ cc_library( "@local_xla//xla/pjrt:pjrt_client", "@local_xla//xla/pjrt:pjrt_common", "@local_xla//xla/pjrt:pjrt_executable", - "@local_xla//xla/pjrt:pjrt_future", "@local_xla//xla/service:executable", "@local_xla//xla/service:maybe_owning_device_memory", "@local_xla//xla/service:shaped_buffer", diff --git a/tensorflow/compiler/jit/build_xla_ops_pass.cc b/tensorflow/compiler/jit/build_xla_ops_pass.cc index bed899bfed2f3e..31f1aeedd9850e 100644 --- a/tensorflow/compiler/jit/build_xla_ops_pass.cc +++ b/tensorflow/compiler/jit/build_xla_ops_pass.cc @@ -132,7 +132,7 @@ void MergeOutgoingDataEdges(const Scope& s, Node* old_node, Node* new_node, if 
(merged_output.node() == nullptr) { Output new_output(new_node, oidx); if (debugging_opts.print_outputs) { - string cpu_device = "/job:localhost/replica:0/task:0/device:CPU:0"; + std::string cpu_device = "/job:localhost/replica:0/task:0/device:CPU:0"; ops::Print print_op(s.WithOpName("print_", oidx) .WithDevice(cpu_device) .WithAssignedDevice(cpu_device), @@ -298,7 +298,8 @@ absl::StatusOr ReplaceFunctionCallWithPartitionedCall( const GraphOptimizationPassOptions& options, const FunctionLibraryDefinition& flib_def, Node* n, Graph* g, const NameAttrList& func, const Scope& root) { - string config_string = options.session_options->config.SerializeAsString(); + std::string config_string = + options.session_options->config.SerializeAsString(); int input_count = absl::c_count_if( n->in_edges(), [](const Edge* e) { return !e->IsControlEdge(); }); @@ -346,7 +347,8 @@ absl::StatusOr ReplaceFunctionCallWithPartitionedCall( absl::StatusOr InferDeviceForCluster( jit::DeviceInfoCache* device_info_cache, Node* n, - const string& function_name, const FunctionLibraryDefinition& flib_def) { + const std::string& function_name, + const FunctionLibraryDefinition& flib_def) { const FunctionDef* func_def = flib_def.Find(function_name); TF_RET_CHECK(func_def) << "Could not find " << function_name; @@ -485,7 +487,8 @@ absl::Status ReplaceNodeWithXlaCompileAndXlaRun( requires_compilation = true; } - string device_name_str = string(device_info_cache->GetNameFor(device)); + std::string device_name_str = + std::string(device_info_cache->GetNameFor(device)); absl::Status status; Scope root = NewInternalScope(g, &status, /*refiner=*/nullptr) diff --git a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc index c3b5ba5521ee65..6b90557df4b86f 100644 --- a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc +++ b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc @@ -85,8 +85,8 @@ absl::Status BuildXlaOps(const Scope& s, const 
FunctionDefLibrary& fdef_lib, return absl::OkStatus(); } -absl::Status MakeXlaCompiledKernel(Graph* graph, const string& callee_name, - const string& node_name, +absl::Status MakeXlaCompiledKernel(Graph* graph, const std::string& callee_name, + const std::string& node_name, int num_constant_args, int num_resource_args, Node** result) { NodeDef call_node; @@ -99,14 +99,16 @@ absl::Status MakeXlaCompiledKernel(Graph* graph, const string& callee_name, return absl::OkStatus(); } -absl::Status MakeXlaCompiledKernel(Graph* graph, const string& callee_name, - const string& node_name, Node** result) { +absl::Status MakeXlaCompiledKernel(Graph* graph, const std::string& callee_name, + const std::string& node_name, + Node** result) { return MakeXlaCompiledKernel(graph, callee_name, node_name, /*num_constant_args=*/0, /*num_resource_args=*/0, result); } -Node* MakeWrite(const Scope& scope, Output value_to_write, const string& id) { +Node* MakeWrite(const Scope& scope, Output value_to_write, + const std::string& id) { Output var_handle = ops::VarHandleOp(scope.WithOpName("Var_" + id), DT_FLOAT, TensorShape({})); ops::AssignVariableOp assign_op(scope.WithOpName("Assignee_" + id), @@ -114,12 +116,13 @@ Node* MakeWrite(const Scope& scope, Output value_to_write, const string& id) { return assign_op.operation.node(); } -Node* MakeWrite(const Scope& scope, const string& id) { +Node* MakeWrite(const Scope& scope, const std::string& id) { return MakeWrite( scope, ops::Const(scope.WithOpName("ValueToAssign" + id), 1.0f), id); } -FunctionDefLibrary CreateFunctionDefLibWithConstFunction(const string& name) { +FunctionDefLibrary CreateFunctionDefLibWithConstFunction( + const std::string& name) { FunctionDefLibrary fdef_lib; FunctionDef func = FunctionDefHelper::Create( /*function_name=*/name, /*in_def=*/{}, /*out_def=*/{"out: float"}, diff --git a/tensorflow/compiler/jit/clone_constants_for_better_clustering.cc b/tensorflow/compiler/jit/clone_constants_for_better_clustering.cc index 
bb8dce848cfbc9..4164efc65a8f4c 100644 --- a/tensorflow/compiler/jit/clone_constants_for_better_clustering.cc +++ b/tensorflow/compiler/jit/clone_constants_for_better_clustering.cc @@ -36,19 +36,21 @@ class CloneConstantsForBetterClusteringPassImpl { private: absl::Status CloneSmallConstantInputs( - const absl::flat_hash_set& name_set, Node* n); - string GenerateUniqueName(const absl::flat_hash_set& name_set, - absl::string_view prefix); - absl::StatusOr CloneNode(const absl::flat_hash_set& name_set, - Node* n); + const absl::flat_hash_set& name_set, Node* n); + std::string GenerateUniqueName( + const absl::flat_hash_set& name_set, + absl::string_view prefix); + absl::StatusOr CloneNode( + const absl::flat_hash_set& name_set, Node* n); Graph* graph_; int unique_name_counter_; }; -string CloneConstantsForBetterClusteringPassImpl::GenerateUniqueName( - const absl::flat_hash_set& name_set, absl::string_view prefix) { - string candidate; +std::string CloneConstantsForBetterClusteringPassImpl::GenerateUniqueName( + const absl::flat_hash_set& name_set, + absl::string_view prefix) { + std::string candidate; do { candidate = absl::StrCat(prefix, "/clone_", unique_name_counter_++); } while (name_set.contains(candidate)); @@ -56,7 +58,7 @@ string CloneConstantsForBetterClusteringPassImpl::GenerateUniqueName( } absl::StatusOr CloneConstantsForBetterClusteringPassImpl::CloneNode( - const absl::flat_hash_set& name_set, Node* n) { + const absl::flat_hash_set& name_set, Node* n) { NodeDef new_in_def = n->def(); new_in_def.clear_input(); new_in_def.set_name(GenerateUniqueName(name_set, new_in_def.name())); @@ -112,7 +114,7 @@ bool IsInPlaceOp(absl::string_view op_name) { absl::Status CloneConstantsForBetterClusteringPassImpl::CloneSmallConstantInputs( - const absl::flat_hash_set& name_set, Node* n) { + const absl::flat_hash_set& name_set, Node* n) { std::vector in_edges; // Get the edges and sort them so we clone in a deterministic order. 
absl::c_copy(n->in_edges(), std::back_inserter(in_edges)); @@ -142,7 +144,7 @@ CloneConstantsForBetterClusteringPassImpl::CloneSmallConstantInputs( } absl::Status CloneConstantsForBetterClusteringPassImpl::Run() { - absl::flat_hash_set name_set; + absl::flat_hash_set name_set; absl::c_transform(graph_->nodes(), std::inserter(name_set, name_set.begin()), [](Node* n) { return n->name(); }); std::vector nodes; diff --git a/tensorflow/compiler/jit/cluster_scoping_pass.cc b/tensorflow/compiler/jit/cluster_scoping_pass.cc index e70be48f0b7341..20a3d98be1d0f2 100644 --- a/tensorflow/compiler/jit/cluster_scoping_pass.cc +++ b/tensorflow/compiler/jit/cluster_scoping_pass.cc @@ -51,8 +51,8 @@ class ClusterScopingPassImpl { size_t unique_scope_id_; }; -std::optional GetXlaInternalScope(Node* node) { - string scope; +std::optional GetXlaInternalScope(Node* node) { + std::string scope; if (GetNodeAttr(node->attrs(), kXlaInternalScopeAttr, &scope).ok()) { return scope; } @@ -85,8 +85,8 @@ void SetXlaInternalScope(Node* node, absl::string_view scope) { // Node_X (scope "stage") -> Stage // void AddOrAppendXlaInternalScope(Node* node, absl::string_view suffix) { - string updated_scope; - std::optional cur_scope = GetXlaInternalScope(node); + std::string updated_scope; + std::optional cur_scope = GetXlaInternalScope(node); if (cur_scope == std::nullopt) { updated_scope = std::string(suffix); } else { @@ -96,7 +96,7 @@ void AddOrAppendXlaInternalScope(Node* node, absl::string_view suffix) { } void ClusterScopingPassImpl::AddScopeToAllTransitivePredecessors(Node* start) { - const string unique_suffix = absl::StrCat("_", GetUniqueScopeId()); + const std::string unique_suffix = absl::StrCat("_", GetUniqueScopeId()); std::vector starts; starts.push_back(start); @@ -106,7 +106,7 @@ void ClusterScopingPassImpl::AddScopeToAllTransitivePredecessors(Node* start) { } void ClusterScopingPassImpl::AddScopeToAllTransitiveSuccessors(Node* start) { - const string unique_suffix = absl::StrCat("_", 
GetUniqueScopeId()); + const std::string unique_suffix = absl::StrCat("_", GetUniqueScopeId()); std::vector starts; starts.push_back(start); diff --git a/tensorflow/compiler/jit/cluster_scoping_pass_test.cc b/tensorflow/compiler/jit/cluster_scoping_pass_test.cc index b09cb2c12fa297..66cc10775992a3 100644 --- a/tensorflow/compiler/jit/cluster_scoping_pass_test.cc +++ b/tensorflow/compiler/jit/cluster_scoping_pass_test.cc @@ -45,10 +45,11 @@ absl::Status ClusterScoping(std::unique_ptr* graph) { return pass.Run(opt_options); } -absl::flat_hash_map GetXlaInternalScopes(const Graph& graph) { - absl::flat_hash_map scopes; +absl::flat_hash_map GetXlaInternalScopes( + const Graph& graph) { + absl::flat_hash_map scopes; for (Node* node : graph.nodes()) { - string scope; + std::string scope; if (GetNodeAttr(node->attrs(), kXlaInternalScopeAttr, &scope).ok()) { scopes[node->name()] = scope; } @@ -63,7 +64,7 @@ absl::flat_hash_map GetXlaInternalScopes(const Graph& graph) { return scopes; } -Node* BuildStageNode(GraphDefBuilder& builder, string name, +Node* BuildStageNode(GraphDefBuilder& builder, std::string name, std::initializer_list dtypes, absl::Span values) { auto opts = builder.opts() diff --git a/tensorflow/compiler/jit/compilability_check_util.cc b/tensorflow/compiler/jit/compilability_check_util.cc index 50b26371698877..6c77648817f808 100644 --- a/tensorflow/compiler/jit/compilability_check_util.cc +++ b/tensorflow/compiler/jit/compilability_check_util.cc @@ -172,7 +172,7 @@ RecursiveCompilabilityChecker::FindUncompilableNodes( } bool RecursiveCompilabilityChecker::HasXLAKernel( - const Node& node, string* uncompilable_reason) const { + const Node& node, std::string* uncompilable_reason) const { // There is a SymbolicGradient kernel on the XLA_JIT device, but the gradient // is really a kind of function call and will be handled by // IsCompilableCall(). 
@@ -424,7 +424,7 @@ bool RecursiveCompilabilityChecker::IsCompilableNode( return false; } - string uncompilable_reason; + std::string uncompilable_reason; if (IsFunctionCall(*lib_runtime->GetFunctionLibraryDefinition(), node)) { if (!IsCompilableCall(node.def(), lib_runtime, stack_trace, encapsulating_function, uncompilable_nodes)) { diff --git a/tensorflow/compiler/jit/compilability_check_util.h b/tensorflow/compiler/jit/compilability_check_util.h index 0d86c22de11a22..7d6741529ebd08 100644 --- a/tensorflow/compiler/jit/compilability_check_util.h +++ b/tensorflow/compiler/jit/compilability_check_util.h @@ -262,7 +262,7 @@ class RecursiveCompilabilityChecker { } bool HasXLAKernel(const Node& node, - string* uncompilable_reason = nullptr) const; + std::string* uncompilable_reason = nullptr) const; static void MaybeMarkUncompilableNode( const absl::string_view reason, diff --git a/tensorflow/compiler/jit/deadness_analysis.cc b/tensorflow/compiler/jit/deadness_analysis.cc index 2b2db07642d1ab..fa546e3543e358 100644 --- a/tensorflow/compiler/jit/deadness_analysis.cc +++ b/tensorflow/compiler/jit/deadness_analysis.cc @@ -123,7 +123,7 @@ class Predicate { public: enum class Kind { kAnd, kOr, kNot, kAndRecurrence, kSymbol, kIntSymbol }; - virtual string ToString() const = 0; + virtual std::string ToString() const = 0; // An ID assigned to the Predicate at construction time. Conceptually like a // pointer, except that it is stable across runs. 
@@ -156,12 +156,12 @@ class AndPredicate : public Predicate { explicit AndPredicate(int64_t id, std::vector operands) : Predicate(id), operands_(std::move(operands)) {} - string ToString() const override { + std::string ToString() const override { if (operands().empty()) { return "#true"; } - std::vector operands_str; + std::vector operands_str; std::transform(operands().begin(), operands().end(), std::back_inserter(operands_str), [](Predicate* pred) { return pred->ToString(); }); @@ -186,12 +186,12 @@ class OrPredicate : public Predicate { explicit OrPredicate(int64_t id, std::vector operands) : Predicate(id), operands_(std::move(operands)) {} - string ToString() const override { + std::string ToString() const override { if (operands().empty()) { return "#false"; } - std::vector operands_str; + std::vector operands_str; std::transform(operands().begin(), operands().end(), std::back_inserter(operands_str), [](Predicate* pred) { return pred->ToString(); }); @@ -215,7 +215,7 @@ class NotPredicate : public Predicate { explicit NotPredicate(int64_t id, Predicate* operand) : Predicate(id), operands_({operand}) {} - string ToString() const override { + std::string ToString() const override { return absl::StrCat("~", operand()->ToString()); } @@ -251,14 +251,14 @@ class NotPredicate : public Predicate { class AndRecurrencePredicate : public Predicate { public: explicit AndRecurrencePredicate(int64_t id, Predicate* start, Predicate* step, - std::vector frame) + std::vector frame) : Predicate(id), operands_({start, step}), frame_(std::move(frame)) {} Predicate* start() const { return operands_[0]; } Predicate* step() const { return operands_[1]; } - absl::Span frame() const { return frame_; } + absl::Span frame() const { return frame_; } - string ToString() const override { + std::string ToString() const override { return absl::StrCat("{", start()->ToString(), ",&,", step()->ToString(), "}<", absl::StrJoin(frame(), ";"), ">"); } @@ -271,7 +271,7 @@ class 
AndRecurrencePredicate : public Predicate { private: std::array operands_; - std::vector frame_; + std::vector frame_; }; // Represents an uninterpreted symbol in a logical predicate. @@ -286,7 +286,7 @@ class SymbolPredicate : public Predicate { tensor_id_(std::move(tensor_id)), must_be_true_(must_be_true) {} - string ToString() const override { + std::string ToString() const override { return must_be_true() ? absl::StrCat("*", tensor_id_.ToString()) : tensor_id_.ToString(); } @@ -320,7 +320,7 @@ class IntSymbolPredicate : public Predicate { tensor_id_(std::move(tensor_id)), must_have_value_(must_have_value) {} - string ToString() const override { + std::string ToString() const override { return must_have_value().has_value() ? absl::StrCat(tensor_id_.ToString(), "=", *must_have_value_) : tensor_id_.ToString(); @@ -396,7 +396,7 @@ class PredicateFactory { } Predicate* MakeAndRecurrencePredicate(Predicate* start, Predicate* step, - std::vector frame) { + std::vector frame) { SignatureForAndRec signature(start, step, std::move(frame)); auto it = interned_and_rec_instances_.find(signature); if (it != interned_and_rec_instances_.end()) { @@ -463,8 +463,8 @@ class PredicateFactory { Tensor tensor(proto->dtype()); TF_RET_CHECK(tensor.FromProto(*proto)); - *predicate = tensor.scalar()() == *must_have_value ? MakeTrue() - : MakeFalse(); + *predicate = tensor.scalar()() == *must_have_value ? 
MakeTrue() + : MakeFalse(); return absl::OkStatus(); } SignatureForIntSymbol signature = {tensor_id, must_have_value}; @@ -559,9 +559,9 @@ class PredicateFactory { std::pair>; using SignatureForNot = Predicate*; using SignatureForAndRec = - std::tuple>; + std::tuple>; using SignatureForSymbol = std::pair; - using SignatureForIntSymbol = std::pair>; + using SignatureForIntSymbol = std::pair>; struct HashSignatureForAndOr { size_t operator()(const SignatureForAndOr& signature) const { @@ -586,7 +586,7 @@ class PredicateFactory { SafeTensorId::Hasher()(signature.first), Hash64Combine( ::tensorflow::hash()(signature.second.has_value()), - ::tensorflow::hash()( + ::tensorflow::hash()( signature.second.has_value() ? *signature.second : 0))); } }; @@ -830,8 +830,8 @@ class DeadnessAnalysisImpl : public DeadnessAnalysis { absl::StatusOr GetPredicateFor( Node* n, int oidx) const override; void Print() const override; - absl::flat_hash_map PredicateMapAsString() - const; + absl::flat_hash_map + PredicateMapAsString() const; private: enum class EdgeKind { kDataAndControl, kDataOnly, kControlOnly }; @@ -958,7 +958,7 @@ absl::Status DeadnessAnalysisImpl::HandleSwitch( for (int i = 0; i < n->num_outputs() - 1; i++) { TF_RETURN_IF_ERROR(predicate_factory_.MakeSymbolPredicate( pred_edge->src(), pred_edge->src_output(), - /*must_have_value=*/std::optional(i), &branch_pred)); + /*must_have_value=*/std::optional(i), &branch_pred)); input_preds.push_back(branch_pred); SetPredicate(n, i, predicate_factory_.MakeAndPredicate(input_preds), should_revisit); @@ -982,7 +982,7 @@ absl::Status DeadnessAnalysisImpl::HandleSwitch( namespace { absl::Status CreateMultipleNextIterationInputsError(Node* merge) { - std::vector backedges; + std::vector backedges; for (const Edge* backedge : merge->in_edges()) { if (backedge->src()->IsNextIteration()) { backedges.push_back(absl::StrCat(" ", SummarizeNode(*backedge->src()))); @@ -1058,7 +1058,7 @@ Predicate* DeduceStepPredicate(PredicateFactory* 
predicate_factory, absl::Status GetFullFrame(const Node* n, absl::Span cfi_infos, - std::vector* frame) { + std::vector* frame) { int depth = 0; for (const ControlFlowInfo* cfi_iter = &cfi_infos[n->id()]; !n->IsSource(); n = cfi_iter->parent_frame, cfi_iter = &cfi_infos[n->id()]) { @@ -1174,7 +1174,7 @@ absl::Status DeadnessAnalysisImpl::HandleMerge( Predicate* start = predicate_factory_.MakeOrPredicate(non_recurrent_inputs); - std::vector frame; + std::vector frame; TF_RETURN_IF_ERROR(GetFullFrame(n, control_flow_info_, &frame)); Predicate* and_rec = predicate_factory_.MakeAndRecurrencePredicate( start, step, std::move(frame)); @@ -1358,7 +1358,7 @@ absl::Status DeadnessAnalysisImpl::GetFrameBasedTopologicalOrder( // nested while, as there is no clean cut for separating them in the topological // order. absl::Status DeadnessAnalysisImpl::Populate(bool enable_optimistic) { - std::vector unreachable_nodes; + std::vector unreachable_nodes; // Compute the loop structure of the graph. TF_RETURN_IF_ERROR( BuildControlFlowInfo(&graph_, &control_flow_info_, &unreachable_nodes)); @@ -1582,9 +1582,9 @@ DeadnessAnalysis::~DeadnessAnalysis() {} return absl::OkStatus(); } -absl::flat_hash_map +absl::flat_hash_map DeadnessAnalysisImpl::PredicateMapAsString() const { - absl::flat_hash_map result; + absl::flat_hash_map result; for (const auto& kv_pair : predicate_map_) { CHECK(result.insert({kv_pair.first, kv_pair.second->ToString()}).second); } @@ -1603,7 +1603,7 @@ absl::Status ComputePredicates(const Graph& graph, } // namespace deadness_analysis_internal -string DeadnessAnalysis::DebugString(DeadnessPredicate predicate) const { +std::string DeadnessAnalysis::DebugString(DeadnessPredicate predicate) const { return static_cast(predicate.pred_)->ToString(); } diff --git a/tensorflow/compiler/jit/deadness_analysis.h b/tensorflow/compiler/jit/deadness_analysis.h index 80fa9a20faef41..1cd394154faf36 100644 --- a/tensorflow/compiler/jit/deadness_analysis.h +++ 
b/tensorflow/compiler/jit/deadness_analysis.h @@ -81,7 +81,7 @@ class DeadnessAnalysis { virtual void Print() const = 0; virtual ~DeadnessAnalysis(); - string DebugString(DeadnessPredicate predicate) const; + std::string DebugString(DeadnessPredicate predicate) const; // Run the deadness analysis over `graph` and returns an error or a populated // instance of DeadnessAnalysis in `result`. diff --git a/tensorflow/compiler/jit/deadness_analysis_internal.h b/tensorflow/compiler/jit/deadness_analysis_internal.h index 0dc18d3e129d79..569cdeadae735e 100644 --- a/tensorflow/compiler/jit/deadness_analysis_internal.h +++ b/tensorflow/compiler/jit/deadness_analysis_internal.h @@ -24,7 +24,8 @@ namespace deadness_analysis_internal { // Returns a map describing the predicate each Tensor was mapped to. For // testing purposes only. -using PredicateMapTy = absl::flat_hash_map; +using PredicateMapTy = + absl::flat_hash_map; absl::Status ComputePredicates(const Graph& graph, PredicateMapTy* out_predicate_map, bool enable_optimistic = true); diff --git a/tensorflow/compiler/jit/deadness_analysis_test.cc b/tensorflow/compiler/jit/deadness_analysis_test.cc index 894ee659121e25..fd7d93b3772f5f 100644 --- a/tensorflow/compiler/jit/deadness_analysis_test.cc +++ b/tensorflow/compiler/jit/deadness_analysis_test.cc @@ -61,7 +61,7 @@ absl::Status AnalyzeDeadness(Graph* graph, return DeadnessAnalysis::Run(*graph, result); } -ops::Switch CreateSwitch(const Scope& root, const string& prefix) { +ops::Switch CreateSwitch(const Scope& root, const std::string& prefix) { Output value = ops::Placeholder(root.WithOpName(prefix + "/value"), DT_FLOAT); Output predicate = ops::Placeholder(root.WithOpName(prefix + "/pred"), DT_BOOL); @@ -76,7 +76,7 @@ void VLogGraphIfAsked(const Graph& graph) { if (VLOG_IS_ON(3)) { GraphDef graph_def; graph.ToGraphDef(&graph_def); - string serialized; + std::string serialized; ::tensorflow::protobuf::TextFormat::PrintToString(graph_def, &serialized); LOG(INFO) << 
serialized; } @@ -127,8 +127,8 @@ struct InductionVarInfo { // +-----> | Exit | // +---------------+ InductionVarInfo CreateInductionVariable(const Scope& root, - const string& prefix, - const string& frame_name, + const std::string& prefix, + const std::string& frame_name, const Output& initial_value) { Output enter_initial_value = ops::internal::Enter( root.WithOpName(prefix + "/enter"), initial_value, frame_name); @@ -158,8 +158,8 @@ InductionVarInfo CreateInductionVariable(const Scope& root, } InductionVarInfo CreateInductionVariable(const Scope& root, - const string& prefix, - const string& frame_name, + const std::string& prefix, + const std::string& frame_name, int32_t init) { return CreateInductionVariable( root, prefix, frame_name, @@ -201,7 +201,7 @@ struct DependentInductionVar { }; DependentInductionVar CreateDependentLoopInvariantValue( - const Scope& root, const string& prefix, const string& frame_name, + const Scope& root, const std::string& prefix, const std::string& frame_name, const Output& loop_cond, const Output& value) { Output enter_value = ops::internal::Enter(root.WithOpName(prefix + "/enter"), value, frame_name); @@ -218,7 +218,7 @@ DependentInductionVar CreateDependentLoopInvariantValue( } DependentInductionVar CreateDependentLoopInvariantValue( - const Scope& root, const string& prefix, const string& frame_name, + const Scope& root, const std::string& prefix, const std::string& frame_name, const Output& loop_cond, int32_t value) { return CreateDependentLoopInvariantValue( root, prefix, frame_name, loop_cond, diff --git a/tensorflow/compiler/jit/device_compilation_cluster_signature.cc b/tensorflow/compiler/jit/device_compilation_cluster_signature.cc index 9ec02d92d37cd6..8288b44e7f1c1d 100644 --- a/tensorflow/compiler/jit/device_compilation_cluster_signature.cc +++ b/tensorflow/compiler/jit/device_compilation_cluster_signature.cc @@ -65,9 +65,9 @@ struct SignatureNotEqual { // Functor that incrementally computes a Signature's hash given 
its current hash // and one of its args. struct SignatureHashCombiner { - explicit SignatureHashCombiner(const uint64 h) : h(h) {} - uint64 h; - uint64 operator()(const Tensor& arg) { + explicit SignatureHashCombiner(const uint64_t h) : h(h) {} + uint64_t h; + uint64_t operator()(const Tensor& arg) { h = Hash64Combine(h, std::hash()(static_cast(arg.dtype()))); h = Hash64Combine( h, Hash64(arg.tensor_data().data(), arg.tensor_data().size())); @@ -76,7 +76,7 @@ struct SignatureHashCombiner { } return h; } - uint64 operator()(const TensorTypeAndShape& arg) { + uint64_t operator()(const TensorTypeAndShape& arg) { h = Hash64Combine(h, std::hash()(static_cast(arg.first))); h = Hash64Combine(h, std::hash()(arg.second.size())); for (int dim : arg.second) { @@ -108,8 +108,8 @@ bool Signature::operator==(const Signature& other) const { return true; } -uint64 Signature::Hash::operator()(const Signature& signature) const { - uint64 h = std::hash()(signature.name); +uint64_t Signature::Hash::operator()(const Signature& signature) const { + uint64_t h = std::hash()(signature.name); for (const auto& arg : signature.args) { h = std::visit(SignatureHashCombiner(h), arg); } diff --git a/tensorflow/compiler/jit/device_compilation_cluster_signature.h b/tensorflow/compiler/jit/device_compilation_cluster_signature.h index b4c2840eedee59..721c1d3b78c50e 100644 --- a/tensorflow/compiler/jit/device_compilation_cluster_signature.h +++ b/tensorflow/compiler/jit/device_compilation_cluster_signature.h @@ -58,7 +58,8 @@ struct DeviceCompilationClusterSignature { bool operator==(const DeviceCompilationClusterSignature& other) const; struct Hash { - uint64 operator()(const DeviceCompilationClusterSignature& signature) const; + uint64_t operator()( + const DeviceCompilationClusterSignature& signature) const; }; // Returns a human-readable description of the signature. 
diff --git a/tensorflow/compiler/jit/device_compilation_profiler.cc b/tensorflow/compiler/jit/device_compilation_profiler.cc index 5e1b3b26e8ecb5..ec161293b7643d 100644 --- a/tensorflow/compiler/jit/device_compilation_profiler.cc +++ b/tensorflow/compiler/jit/device_compilation_profiler.cc @@ -107,7 +107,7 @@ absl::Status DeviceCompilationProfiler::RegisterCompilation( cluster_compile_stats_.emplace(function.name(), ClusterCompileStats{}) .first; - const uint64 compile_time_s = compile_time_us / 1.0e6; + const uint64_t compile_time_s = compile_time_us / 1.0e6; it->second.compile_count++; it->second.cumulative_compile_time_us += compile_time_us; VLOG(1) << "Compiled " << function_name << " " << it->second.compile_count diff --git a/tensorflow/compiler/jit/device_compiler.h b/tensorflow/compiler/jit/device_compiler.h index 0fae07abd22897..a9f2418282c414 100644 --- a/tensorflow/compiler/jit/device_compiler.h +++ b/tensorflow/compiler/jit/device_compiler.h @@ -137,7 +137,7 @@ class DeviceCompiler : public ResourceBase { return compiler_client_.get(); } - string DebugString() const override; + std::string DebugString() const override; private: // Common implementation of Compile and CompileSingleOp. The `OpKernelContext` @@ -259,7 +259,7 @@ DeviceCompiler::~DeviceCompiler() { } template -string DeviceCompiler::DebugString() const { +std::string DeviceCompiler::DebugString() const { return "DeviceCompiler"; } @@ -331,7 +331,7 @@ DeviceCompiler::CompileStrict( CompileScope scope, OpKernelContext* ctx, DeviceCompilationProfiler* profiler, mutex* mu) { tensorflow::Env* env = tensorflow::Env::Default(); - const uint64 compile_start_us = env->NowMicros(); + const uint64_t compile_start_us = env->NowMicros(); TfGraphToHloCompiler compiler(options); cache_value.compile_state = DeviceCompileState::kCompiled; @@ -385,8 +385,8 @@ DeviceCompiler::CompileStrict( // Finalize the cache to release the XlaComputation after it was compiled. 
cache_->Finalize(); - const uint64 compile_end_us = env->NowMicros(); - const uint64 compile_time_us = compile_end_us - compile_start_us; + const uint64_t compile_end_us = env->NowMicros(); + const uint64_t compile_time_us = compile_end_us - compile_start_us; device_compiler_internal::LogOnceXlaCompiledFirstCluster(); TF_RETURN_IF_ERROR(profiler->RegisterCompilation( @@ -496,7 +496,7 @@ absl::Status DeviceCompiler::CompileImpl( profiler->RegisterExecution(function); - string human_signature; + std::string human_signature; if (VLOG_IS_ON(2)) { human_signature = VLOG_IS_ON(3) ? signature.HumanString() : function.name(); VLOG(2) << "DeviceCompilationClusterSignature: " << human_signature; diff --git a/tensorflow/compiler/jit/device_compiler_test.cc b/tensorflow/compiler/jit/device_compiler_test.cc index 64e286bff55b07..749110be186311 100644 --- a/tensorflow/compiler/jit/device_compiler_test.cc +++ b/tensorflow/compiler/jit/device_compiler_test.cc @@ -139,7 +139,7 @@ class MockXlaDeviceExecutablePersistor Config{testing::TmpDir(), false, "xla"}, DeviceType(DEVICE_CPU_XLA_JIT)) {} MOCK_METHOD(absl::Status, TryToPersistExecutable, - (uint64, const std::string&, const XlaCompiler::Options&, + (uint64_t, const std::string&, const XlaCompiler::Options&, const XlaCompiler::CompilationResult&, const xla::LocalExecutable&, (DeviceCompilerClient*)), @@ -425,7 +425,7 @@ TEST_F(DeviceCompilerTest, CompileFailedToLoadFromPersistentCache) { &xla_executable)); // Corrupt the file which contains the serialized executable. 
- std::vector files; + std::vector files; TF_ASSERT_OK(Env::Default()->GetChildren(testing::TmpDir(), &files)); std::string const* serialized_executable_filename = nullptr; for (const auto& file : files) { diff --git a/tensorflow/compiler/jit/device_context_test.cc b/tensorflow/compiler/jit/device_context_test.cc index 34a0c3d5ea067b..33bba30f3db3e1 100644 --- a/tensorflow/compiler/jit/device_context_test.cc +++ b/tensorflow/compiler/jit/device_context_test.cc @@ -38,7 +38,7 @@ static bool Initialized = [] { class DeviceContextTest : public ::testing::Test { public: - void SetDevice(const string& device_type) { + void SetDevice(const std::string& device_type) { auto& rollout_config = GetXlaOpsCommonFlags()->tf_xla_use_device_api; rollout_config.AllowForDeviceInXlaLaunch(DeviceType(device_type)); rollout_config.AllowForDeviceInXlaCompileOnDemand(DeviceType(device_type)); diff --git a/tensorflow/compiler/jit/device_executable_persistor.h b/tensorflow/compiler/jit/device_executable_persistor.h index 458441c86b5c43..5a64b078e1a93c 100644 --- a/tensorflow/compiler/jit/device_executable_persistor.h +++ b/tensorflow/compiler/jit/device_executable_persistor.h @@ -96,7 +96,7 @@ class DeviceExecutablePersistor { // TODO(b/255826209): Take in Signature instead of hash and string once cache // is refactored. std::optional>> TryToLoadExecutable( - uint64 signature_hash, const std::string& signature_str, + uint64_t signature_hash, const std::string& signature_str, const XlaCompiler::Options& options, const XlaCompiler::CompilationResult& compilation_result, DeviceCompilerClient* client) const; @@ -107,7 +107,7 @@ class DeviceExecutablePersistor { // TODO(b/255826209): Take in Signature instead hash and string once cache // is refactored. 
virtual absl::Status TryToPersistExecutable( - uint64 signature_hash, const std::string& signature_str, + uint64_t signature_hash, const std::string& signature_str, const XlaCompiler::Options& options, const XlaCompiler::CompilationResult& compilation_result, const ExecutableType& executable, @@ -123,15 +123,15 @@ class DeviceExecutablePersistor { // Returns a cache key proto that identifies an entry in the compilation // cache. XlaSerializedCacheKey BuildSerializedCacheKey( - uint64 signature_hash, const xla::HloModuleProto& hlo_module) const; + uint64_t signature_hash, const xla::HloModuleProto& hlo_module) const; XlaSerializedCacheKey BuildSerializedCacheKey( - uint64 signature_hash, const xla::HloModuleProto& hlo_module, + uint64_t signature_hash, const xla::HloModuleProto& hlo_module, bool compiled_using_pjrt) const; // Serializes the signature and its corresponding entry to a proto message. absl::StatusOr SerializeEntry( - uint64 signature_hash, const XlaCompiler::Options& options, + uint64_t signature_hash, const XlaCompiler::Options& options, const XlaCompiler::CompilationResult& compilation_result, const ExecutableType& executable, DeviceCompilerClient* compiler_client) const; @@ -189,7 +189,7 @@ std::string DeviceExecutablePersistor::GetFilePath( template XlaSerializedCacheKey DeviceExecutablePersistor::BuildSerializedCacheKey( - uint64 signature_hash, const xla::HloModuleProto& hlo_module, + uint64_t signature_hash, const xla::HloModuleProto& hlo_module, bool compiled_using_pjrt) const { XlaSerializedCacheKey key; key.set_signature_fingerprint(signature_hash); @@ -203,7 +203,7 @@ DeviceExecutablePersistor::BuildSerializedCacheKey( template XlaSerializedCacheKey DeviceExecutablePersistor::BuildSerializedCacheKey( - uint64 signature_hash, const xla::HloModuleProto& hlo_module) const { + uint64_t signature_hash, const xla::HloModuleProto& hlo_module) const { return BuildSerializedCacheKey(signature_hash, hlo_module, false); } @@ -212,7 +212,7 @@ 
DeviceExecutablePersistor::BuildSerializedCacheKey( template <> inline XlaSerializedCacheKey DeviceExecutablePersistor:: - BuildSerializedCacheKey(uint64 signature_hash, + BuildSerializedCacheKey(uint64_t signature_hash, const xla::HloModuleProto& hlo_module) const { return BuildSerializedCacheKey(signature_hash, hlo_module, true); } @@ -305,7 +305,7 @@ DeviceExecutablePersistor::SaveSerializedEntry( template absl::StatusOr DeviceExecutablePersistor::SerializeEntry( - uint64 signature_hash, const XlaCompiler::Options& options, + uint64_t signature_hash, const XlaCompiler::Options& options, const XlaCompiler::CompilationResult& compilation_result, const ExecutableType& executable, DeviceCompilerClient* compiler_client) const { @@ -340,7 +340,7 @@ DeviceExecutablePersistor::SerializeEntry( template std::optional>> DeviceExecutablePersistor::TryToLoadExecutable( - uint64 signature_hash, const std::string& signature_str, + uint64_t signature_hash, const std::string& signature_str, const XlaCompiler::Options& options, const XlaCompiler::CompilationResult& compilation_result, DeviceCompilerClient* compiler_client) const { @@ -376,7 +376,7 @@ DeviceExecutablePersistor::TryToLoadExecutable( template absl::Status DeviceExecutablePersistor::TryToPersistExecutable( - uint64 signature_hash, const std::string& signature_str, + uint64_t signature_hash, const std::string& signature_str, const XlaCompiler::Options& options, const XlaCompiler::CompilationResult& compilation_result, const ExecutableType& executable, diff --git a/tensorflow/compiler/jit/device_executable_persistor_test.cc b/tensorflow/compiler/jit/device_executable_persistor_test.cc index 7779f1112e7b9e..62cfd4c1b8e0b7 100644 --- a/tensorflow/compiler/jit/device_executable_persistor_test.cc +++ b/tensorflow/compiler/jit/device_executable_persistor_test.cc @@ -222,7 +222,7 @@ absl::StatusOr ReadCacheEntryFromFile( } XlaSerializedCacheKey CreateCacheKey( - uint64 signature_hash, + uint64_t signature_hash, const 
XlaCompiler::CompilationResult& compilation_result, const DeviceType& device_type, const std::string& persistence_prefix, bool compiled_using_pjrt = false) { diff --git a/tensorflow/compiler/jit/device_util.cc b/tensorflow/compiler/jit/device_util.cc index 828da0b08c2590..1979aec5bcf0c3 100644 --- a/tensorflow/compiler/jit/device_util.cc +++ b/tensorflow/compiler/jit/device_util.cc @@ -44,7 +44,7 @@ void DeviceSet::UnionWith(const DeviceSet& other) { } bool DeviceSet::IsEmpty() const { - return absl::c_all_of(storage_, [&](uint64 val) { return val == 0; }); + return absl::c_all_of(storage_, [&](uint64_t val) { return val == 0; }); } absl::StatusOr DeviceInfoCache::GetIdFor(absl::string_view name) { @@ -56,7 +56,7 @@ absl::StatusOr DeviceInfoCache::GetIdFor(absl::string_view name) { } int new_id = names_.size(); - names_.push_back(string(name)); + names_.push_back(std::string(name)); id_to_device_type_.push_back(std::make_unique("")); DeviceType* device_type = id_to_device_type_.back().get(); TF_RETURN_IF_ERROR(DeviceNameToDeviceType(names_.back(), device_type)); @@ -64,7 +64,7 @@ absl::StatusOr DeviceInfoCache::GetIdFor(absl::string_view name) { is_cpu_.push_back(device_type->type_string() == DEVICE_CPU); is_gpu_.push_back(device_type->type_string() == DEVICE_GPU); - name_to_id_.emplace(string(name), DeviceId(new_id)); + name_to_id_.emplace(std::string(name), DeviceId(new_id)); const XlaOpRegistry::DeviceRegistration* compilation_device; if (!XlaOpRegistry::GetCompilationDevice(device_type->type(), @@ -76,10 +76,10 @@ absl::StatusOr DeviceInfoCache::GetIdFor(absl::string_view name) { return DeviceId(new_id); } -string DeviceInfoCache::DebugString(const DeviceSet& device_set) const { - std::vector names; +std::string DeviceInfoCache::DebugString(const DeviceSet& device_set) const { + std::vector names; device_set.ForEach([&](DeviceId device_id) { - names.push_back(string(GetNameFor(device_id))); + names.push_back(std::string(GetNameFor(device_id))); return true; }); 
@@ -87,7 +87,7 @@ string DeviceInfoCache::DebugString(const DeviceSet& device_set) const { } } // namespace jit -absl::Status DeviceNameToDeviceType(const string& device, +absl::Status DeviceNameToDeviceType(const std::string& device, DeviceType* device_type) { DeviceNameUtils::ParsedName parsed; if (!DeviceNameUtils::ParseFullName(device, &parsed)) { diff --git a/tensorflow/compiler/jit/device_util.h b/tensorflow/compiler/jit/device_util.h index 745f87309501d8..fa862aac88c394 100644 --- a/tensorflow/compiler/jit/device_util.h +++ b/tensorflow/compiler/jit/device_util.h @@ -75,9 +75,9 @@ class DeviceSet { // iterator if this ends up being used widely. for (int word_index = 0, end = storage_.size(); word_index < end; word_index++) { - uint64 word = storage_[word_index]; + uint64_t word = storage_[word_index]; while (word != 0) { - uint64 only_lowest_bit_set = word & -word; + uint64_t only_lowest_bit_set = word & -word; // The number of trailing zeros in a non-zero word is the index of the // least significant 1. int bit_index = absl::countr_zero(word); @@ -90,7 +90,7 @@ class DeviceSet { } private: - absl::InlinedVector storage_; + absl::InlinedVector storage_; const int kWordSize = 64; }; @@ -131,17 +131,17 @@ class DeviceInfoCache { return std::cref(*id_to_device_type_[device_id.id()]); } - string DebugString(const DeviceSet& device_set) const; + std::string DebugString(const DeviceSet& device_set) const; private: - absl::flat_hash_map name_to_id_; + absl::flat_hash_map name_to_id_; // These fields are populated for a device in GetIdFor, *before* we give out a // DeviceId. std::vector id_to_compilation_device_; std::vector> id_to_device_type_; - std::vector names_; + std::vector names_; std::vector is_cpu_; std::vector is_gpu_; }; @@ -149,7 +149,7 @@ class DeviceInfoCache { } // namespace jit // Returns the DeviceType corresponding to 'device'. 
-absl::Status DeviceNameToDeviceType(const string& device, +absl::Status DeviceNameToDeviceType(const std::string& device, DeviceType* device_type); // Picks the device for which XLA should compile a cluster that contains diff --git a/tensorflow/compiler/jit/device_util_test.cc b/tensorflow/compiler/jit/device_util_test.cc index cef39df6283f2b..be58292f931686 100644 --- a/tensorflow/compiler/jit/device_util_test.cc +++ b/tensorflow/compiler/jit/device_util_test.cc @@ -23,7 +23,7 @@ namespace { absl::Status PickDeviceHelper(bool allow_mixing_unknown_and_cpu, absl::Span device_names, - string* result) { + std::string* result) { jit::DeviceInfoCache cache; jit::DeviceSet device_set; for (absl::string_view name : device_names) { @@ -34,14 +34,14 @@ absl::Status PickDeviceHelper(bool allow_mixing_unknown_and_cpu, TF_ASSIGN_OR_RETURN( jit::DeviceId result_id, PickDeviceForXla(cache, device_set, allow_mixing_unknown_and_cpu)); - *result = string(cache.GetNameFor(result_id)); + *result = std::string(cache.GetNameFor(result_id)); return absl::OkStatus(); } void CheckPickDeviceResult(absl::string_view expected_result, bool allow_mixing_unknown_and_cpu, absl::Span inputs) { - string result; + std::string result; TF_ASSERT_OK(PickDeviceHelper(allow_mixing_unknown_and_cpu, inputs, &result)) << "inputs = [" << absl::StrJoin(inputs, ", ") << "], allow_mixing_unknown_and_cpu=" << allow_mixing_unknown_and_cpu @@ -51,7 +51,7 @@ void CheckPickDeviceResult(absl::string_view expected_result, void CheckPickDeviceHasError(bool allow_mixing_unknown_and_cpu, absl::Span inputs) { - string result; + std::string result; EXPECT_FALSE( PickDeviceHelper(allow_mixing_unknown_and_cpu, inputs, &result).ok()); } @@ -110,10 +110,10 @@ void SimpleRoundTripTestForDeviceSet(int num_devices) { jit::DeviceSet device_set; jit::DeviceInfoCache device_info_cache; - std::vector expected_devices, actual_devices; + std::vector expected_devices, actual_devices; for (int i = 0; i < num_devices; i++) { - string 
device_name = + std::string device_name = absl::StrCat("/job:localhost/replica:0/task:0/device:XPU:", i); TF_ASSERT_OK_AND_ASSIGN(jit::DeviceId device_id, device_info_cache.GetIdFor(device_name)); @@ -122,7 +122,8 @@ void SimpleRoundTripTestForDeviceSet(int num_devices) { } device_set.ForEach([&](jit::DeviceId device_id) { - actual_devices.push_back(string(device_info_cache.GetNameFor(device_id))); + actual_devices.push_back( + std::string(device_info_cache.GetNameFor(device_id))); return true; }); diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc index 3e8a43ce08ed58..6e7d16de16a4f6 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc @@ -115,7 +115,7 @@ void MarkGuaranteedConstants( } struct OutputInputTensorPairHasher { - uint64 operator()(std::pair const& s) const { + uint64_t operator()(std::pair const& s) const { return Hash64Combine(OutputTensor::Hash()(s.first), InputTensor::Hash()(s.second)); } @@ -128,7 +128,7 @@ static const char* const kRetValOp = "_Retval"; class Encapsulator { public: - Encapsulator(string group_attribute, Graph const* graph_in) + Encapsulator(std::string group_attribute, Graph const* graph_in) : group_attribute_(std::move(group_attribute)), graph_in_(graph_in) {} // Find subgraphs marked with 'group_attribute', and build a new @@ -182,7 +182,7 @@ class Encapsulator { // 'reuse_existing_functions' is set, use an existing function with the same // name, if any. If 'rewrite_subgraph_fn' is set, it is applied to the // subgraph before function conversion. 
- absl::Status BuildFunctionDef(const string& name_in, + absl::Status BuildFunctionDef(const std::string& name_in, const RewriteSubgraphFn& rewrite_subgraph_fn, bool reuse_existing_functions, FunctionLibraryDefinition* library); @@ -226,7 +226,7 @@ class Encapsulator { const absl::flat_hash_map& node_images); // Creates the sequencer node if it doesn't exist, adding it to graph_out. - absl::Status MakeSequencingNode(const string& subgraph_name, + absl::Status MakeSequencingNode(const std::string& subgraph_name, Graph* graph_out); // If there is a sequencer node, adds a control edge from the sequencer to @@ -243,14 +243,14 @@ class Encapsulator { // Which device are these nodes on? Used to assign a device to the call // node. - string device_; + std::string device_; // NodeDef for the function call node. NodeDef call_node_def_; // Name that is used for the call node. This may not be // call_node_def_.name() if the client supplies a rewrite lambda. - string function_def_name_; + std::string function_def_name_; // Placeholder node simulating the host compute key in the output graph. // Not owned. @@ -275,7 +275,7 @@ class Encapsulator { // Set of node names that are the source of a control output of the // subgraph. We store strings here so that we can tolerate nodes being // removed from the graph. - absl::flat_hash_set control_output_nodes_; + absl::flat_hash_set control_output_nodes_; // NoOp node in the output graph that is sequenced after the call node. Node* sequencer_ = nullptr; @@ -283,7 +283,7 @@ class Encapsulator { // Returns the key attribute associated with a node in attr. Sets either // result to the empty string if the respective attribute is not found. - absl::Status GetFunctionNameAttr(Node const* node, string* attr) const; + absl::Status GetFunctionNameAttr(Node const* node, std::string* attr) const; // Copies edges local to a subgraph. Adds _Arg and _Retval nodes to // subgraphs for data edges that cross subgraph boundaries. 
@@ -308,36 +308,35 @@ class Encapsulator { // a subgraph boundary it is the output of a call node, otherwise it is a node // in the output graph. absl::Status FindOutputImageOfEdgeSrc( - const string& src_func_id, const string& dst_func_id, + const std::string& src_func_id, const std::string& dst_func_id, const absl::flat_hash_map& node_images, const Node* original_src_node, Node** src_image); // Finds an edge source slot in the output graph. If the edge crosses a // subgraph boundary it is a slot on the output of a call node, otherwise it // is a slot on a node in the output graph. - int FindOutputSlotOfEdgeSrc(const string& src_func_id, - const string& dst_func_id, - const Edge* edge); + int FindOutputSlotOfEdgeSrc(const std::string& src_func_id, + const std::string& dst_func_id, const Edge* edge); // Finds the image of an edge destination in the output graph. If the edge // crosses a subgraph boundary it is the input of a call node, otherwise it is // a node in the output graph. absl::Status FindOutputImageOfEdgeDst( - const string& src_func_id, const string& dst_func_id, + const std::string& src_func_id, const std::string& dst_func_id, const absl::flat_hash_map& node_images, const Node* original_dst_node, Node** dst_image); // Finds an edge destination slot in the output graph. If the edge crosses a // subgraph boundary it is a slot on the input of a call node, otherwise it is // a slot on a node in the output graph. - int FindOutputSlotOfEdgeDst(const string& src_func_id, - const string& dst_func_id, - const Edge* edge); + int FindOutputSlotOfEdgeDst(const std::string& src_func_id, + const std::string& dst_func_id, const Edge* edge); // Copies a single edge to the output graph. The edge is either entirely // within the output graph, or crosses into or out of a compiled subgraph. 
absl::Status CopyEdgeToOutputGraph( - const Edge* edge, const string& src_func_id, const string& dst_func_id, + const Edge* edge, const std::string& src_func_id, + const std::string& dst_func_id, const absl::flat_hash_map& node_images, Graph* graph_out, absl::flat_hash_set, @@ -358,10 +357,10 @@ class Encapsulator { absl::flat_hash_map* node_images, FunctionLibraryDefinition* library); - const string group_attribute_; + const std::string group_attribute_; const Graph* graph_in_; - absl::flat_hash_map subgraphs_; + absl::flat_hash_map subgraphs_; Encapsulator(const Encapsulator&) = delete; void operator=(const Encapsulator&) = delete; @@ -374,19 +373,20 @@ namespace { // including clusters that are not present in the ancestors map. has_successors // is the set of clusters that are ancestors of some other cluster. void TopologicalClusterSort( - const absl::flat_hash_set& clusters, - const absl::flat_hash_set& has_successors, - const absl::flat_hash_map>& ancestors, - std::vector* sorted) { + const absl::flat_hash_set& clusters, + const absl::flat_hash_set& has_successors, + const absl::flat_hash_map>& + ancestors, + std::vector* sorted) { // The nodes are placed in 'sorted' in topological order. sorted->clear(); // We don't use the standard DFS because we are not operating on Node* // objects. struct Work { - string cluster; + std::string cluster; bool leave; }; - std::set visited; + std::set visited; std::vector stack; // Seed the processing list with clusters that have no successors. for (const auto& cluster : clusters) { @@ -523,7 +523,7 @@ absl::Status Encapsulator::Subgraph::RecordResult( } absl::Status Encapsulator::Subgraph::MakeSequencingNode( - const string& subgraph_name, Graph* graph_out) { + const std::string& subgraph_name, Graph* graph_out) { if (sequencer_ == nullptr) { NodeDef seq_def; // TODO(shikharagarwal): What source node should we use for errors? 
@@ -547,11 +547,11 @@ void Encapsulator::Subgraph::ConnectSequencerToCallNode(Graph* graph_out) { } absl::Status Encapsulator::Subgraph::BuildFunctionDef( - const string& name_in, const RewriteSubgraphFn& rewrite_subgraph_fn, + const std::string& name_in, const RewriteSubgraphFn& rewrite_subgraph_fn, bool reuse_existing_functions, FunctionLibraryDefinition* library) { // name_in is copied here because name may be modified below if // rewrite_subgraph_fn is true. - string name = name_in; + std::string name = name_in; call_node_def_.set_op(name); call_node_def_.set_name(name); call_node_def_.set_device(device_); @@ -596,7 +596,7 @@ absl::Status Encapsulator::Subgraph::BuildFunctionDef( function_def_name_ = name; FunctionDef fdef; - auto lookup = [this](const Node* node) -> std::optional { + auto lookup = [this](const Node* node) -> std::optional { if (control_output_nodes_.contains(node->name())) { return std::make_optional(node->name()); } @@ -625,7 +625,7 @@ absl::Status Encapsulator::Subgraph::BuildFunctionDef( absl::Status Encapsulator::Subgraph::ReplaceFunctionDef( FunctionLibraryDefinition* library) { - const string& name = function_def_name_; + const std::string& name = function_def_name_; FunctionDef fdef; TF_RETURN_IF_ERROR(GraphToFunctionDef(*graph_, name, &fdef)); @@ -654,7 +654,7 @@ absl::Status Encapsulator::Subgraph::AddFunctionCallNode( } absl::Status Encapsulator::GetFunctionNameAttr(Node const* node, - string* attr) const { + std::string* attr) const { AttrSlice attrs = node->attrs(); attr->clear(); for (const auto& node_attr : attrs) { @@ -667,12 +667,12 @@ absl::Status Encapsulator::GetFunctionNameAttr(Node const* node, return absl::OkStatus(); } -bool IsInSubgraph(const string& func_id) { return !func_id.empty(); } +bool IsInSubgraph(const std::string& func_id) { return !func_id.empty(); } absl::Status Encapsulator::CopySubgraphNodes( absl::flat_hash_map* node_images) { for (Node* node : graph_in_->op_nodes()) { - string func_id; + std::string 
func_id; TF_RETURN_IF_ERROR(GetFunctionNameAttr(node, &func_id)); if (!IsInSubgraph(func_id)) continue; @@ -688,9 +688,9 @@ absl::Status Encapsulator::CopySubgraphEdges( const absl::flat_hash_map& node_images, std::vector>* src_arg_pairs) { for (const Edge* edge : graph_in_->edges()) { - string src_func_id; + std::string src_func_id; TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->src(), &src_func_id)); - string dst_func_id; + std::string dst_func_id; TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->dst(), &dst_func_id)); Node* src_image = gtl::FindWithDefault(node_images, edge->src(), nullptr); Node* dst_image = gtl::FindWithDefault(node_images, edge->dst(), nullptr); @@ -793,7 +793,7 @@ absl::Status Encapsulator::BuildFunctionDefs( const RewriteSubgraphFn& rewrite_subgraph_fn, bool reuse_existing_functions, FunctionLibraryDefinition* library) { for (auto& subgraph_entry : subgraphs_) { - string name = subgraph_entry.first; + std::string name = subgraph_entry.first; Subgraph& subgraph = subgraph_entry.second; TF_RETURN_IF_ERROR(subgraph.BuildFunctionDef( name, rewrite_subgraph_fn, reuse_existing_functions, library)); @@ -804,7 +804,7 @@ absl::Status Encapsulator::BuildFunctionDefs( absl::Status Encapsulator::CopyNodesToOutputGraph( Graph* graph_out, absl::flat_hash_map* node_images) { for (Node* node : graph_in_->op_nodes()) { - string func_id; + std::string func_id; TF_RETURN_IF_ERROR(GetFunctionNameAttr(node, &func_id)); // Don't copy nodes that are going to be encapsulated. 
@@ -829,7 +829,7 @@ absl::Status Encapsulator::AddFunctionCallNodes( } absl::Status Encapsulator::FindOutputImageOfEdgeSrc( - const string& src_func_id, const string& dst_func_id, + const std::string& src_func_id, const std::string& dst_func_id, const absl::flat_hash_map& node_images, const Node* original_src_node, Node** src_image) { if (IsInSubgraph(src_func_id)) { @@ -844,8 +844,8 @@ absl::Status Encapsulator::FindOutputImageOfEdgeSrc( return absl::OkStatus(); } -int Encapsulator::FindOutputSlotOfEdgeSrc(const string& src_func_id, - const string& dst_func_id, +int Encapsulator::FindOutputSlotOfEdgeSrc(const std::string& src_func_id, + const std::string& dst_func_id, const Edge* edge) { if (IsInSubgraph(src_func_id)) { const Subgraph& src_subgraph = subgraphs_.at(src_func_id); @@ -860,7 +860,7 @@ int Encapsulator::FindOutputSlotOfEdgeSrc(const string& src_func_id, } absl::Status Encapsulator::FindOutputImageOfEdgeDst( - const string& src_func_id, const string& dst_func_id, + const std::string& src_func_id, const std::string& dst_func_id, const absl::flat_hash_map& node_images, const Node* original_dst_node, Node** dst_image) { if (IsInSubgraph(dst_func_id)) { @@ -875,8 +875,8 @@ absl::Status Encapsulator::FindOutputImageOfEdgeDst( return absl::OkStatus(); } -int Encapsulator::FindOutputSlotOfEdgeDst(const string& src_func_id, - const string& dst_func_id, +int Encapsulator::FindOutputSlotOfEdgeDst(const std::string& src_func_id, + const std::string& dst_func_id, const Edge* edge) { if (IsInSubgraph(dst_func_id)) { const Subgraph& dst_subgraph = subgraphs_.at(dst_func_id); @@ -891,7 +891,8 @@ int Encapsulator::FindOutputSlotOfEdgeDst(const string& src_func_id, } absl::Status Encapsulator::CopyEdgeToOutputGraph( - const Edge* edge, const string& src_func_id, const string& dst_func_id, + const Edge* edge, const std::string& src_func_id, + const std::string& dst_func_id, const absl::flat_hash_map& node_images, Graph* graph_out, absl::flat_hash_set, @@ -943,9 +944,9 @@ 
absl::Status Encapsulator::AddEdgesToOutputGraph( edges_added; for (const Edge* edge : graph_in_->edges()) { - string src_func_id; + std::string src_func_id; TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->src(), &src_func_id)); - string dst_func_id; + std::string dst_func_id; TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->dst(), &dst_func_id)); // Ignore edges that are strictly contained within one subgraph, unless @@ -1091,7 +1092,7 @@ absl::Status Encapsulator::BuildOutputGraph( } // anonymous namespace absl::Status EncapsulateSubgraphsInFunctions( - string group_attribute, const Graph& graph_in, + std::string group_attribute, const Graph& graph_in, const RewriteSubgraphFn& rewrite_subgraph_fn, bool reuse_existing_functions, std::unique_ptr* graph_out, FunctionLibraryDefinition* library) { Encapsulator encapsulator(std::move(group_attribute), diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h index 0c7729f67349b5..ed2c9ef45a2c16 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h @@ -73,7 +73,7 @@ typedef std::function* graph_out, FunctionLibraryDefinition* library); diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc index 1e05ad067def7f..94b136a02b99cf 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc @@ -46,7 +46,7 @@ const char* const kXlaHostTransferSequencerAttr = "_xla_host_transfer_sequencer"; absl::Status AddGraphDefToFunctionLibrary( - const GraphDefBuilder& graphdef_builder, const string& name_suffix, + const GraphDefBuilder& graphdef_builder, const std::string& name_suffix, FunctionDefLibrary* library) { GraphDef graphdef; TF_RETURN_IF_ERROR(graphdef_builder.ToGraphDef(&graphdef)); @@ -64,13 +64,14 @@ absl::Status AddGraphDefToFunctionLibrary( } 
template -bool EqualProtoMap(const ::tensorflow::protobuf::Map& a, - const ::tensorflow::protobuf::Map& b, - const std::function& key_to_string, - const std::function& value_to_string, - const std::function& compare, - const string& map_name, string* diff) { +bool EqualProtoMap( + const ::tensorflow::protobuf::Map& a, + const ::tensorflow::protobuf::Map& b, + const std::function& key_to_string, + const std::function& value_to_string, + const std::function& + compare, + const std::string& map_name, std::string* diff) { for (const auto& elt_a : a) { const auto iter = b.find(elt_a.first); if (iter == b.end()) { @@ -106,7 +107,7 @@ bool EqualProtoMap(const ::tensorflow::protobuf::Map& a, } bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b, - const string& diff_preamble, string* diff) { + const std::string& diff_preamble, std::string* diff) { if (a.op() != b.op()) { if (diff) { *diff = absl::StrCat(diff_preamble, " mismatch for node ", a.name(), @@ -131,8 +132,8 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b, } return false; } - std::unordered_set control_input_a; - std::unordered_set control_input_b; + std::unordered_set control_input_a; + std::unordered_set control_input_b; for (int i = 0; i < a.input_size(); ++i) { if (absl::StartsWith(a.input(i), "^")) { if (!absl::StartsWith(b.input(i), "^")) { @@ -164,17 +165,17 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b, } return false; } - return EqualProtoMap( - a.attr(), b.attr(), [](const string& s) { return s; }, + return EqualProtoMap( + a.attr(), b.attr(), [](const std::string& s) { return s; }, [](const AttrValue& v) { return v.DebugString(); }, - [](const string& key, const AttrValue& av, const AttrValue& bv) { + [](const std::string& key, const AttrValue& av, const AttrValue& bv) { if (key == "ancestors") { // The ancestors are added from a set so the order is unpredictable; // just compare set equality not list equality. 
- std::unordered_set a_set(av.list().s().begin(), - av.list().s().end()); - std::unordered_set b_set(bv.list().s().begin(), - bv.list().s().end()); + std::unordered_set a_set(av.list().s().begin(), + av.list().s().end()); + std::unordered_set b_set(bv.list().s().begin(), + bv.list().s().end()); return a_set == b_set; } else { return av.DebugString() == bv.DebugString(); @@ -184,7 +185,7 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b, } bool EqualFunctionDef(const FunctionDef& a, const FunctionDef& b, - string* diff) { + std::string* diff) { if (a.signature().DebugString() != b.signature().DebugString()) { if (diff) { *diff = @@ -194,22 +195,21 @@ bool EqualFunctionDef(const FunctionDef& a, const FunctionDef& b, } return false; } - if (!EqualProtoMap( - a.attr(), b.attr(), [](const string& s) { return s; }, + if (!EqualProtoMap( + a.attr(), b.attr(), [](const std::string& s) { return s; }, [](const AttrValue& v) { return v.DebugString(); }, - [](const string& key, const AttrValue& av, const AttrValue& bv) { + [](const std::string& key, const AttrValue& av, const AttrValue& bv) { return av.DebugString() == bv.DebugString(); }, absl::StrCat("attr mismatch for function ", a.signature().name()), diff)) { return false; } - if (!EqualProtoMap( - a.ret(), b.ret(), [](const string& s) { return s; }, - [](const string& s) { return s; }, - [](const string& key, const string& av, const string& bv) { - return av == bv; - }, + if (!EqualProtoMap( + a.ret(), b.ret(), [](const std::string& s) { return s; }, + [](const std::string& s) { return s; }, + [](const std::string& key, const std::string& av, + const std::string& bv) { return av == bv; }, absl::StrCat("ret mismatch for function ", a.signature().name()), diff)) { return false; @@ -257,8 +257,9 @@ bool EqualFunctionDef(const FunctionDef& a, const FunctionDef& b, } bool EqualFunctionDefLibrary(const FunctionDefLibrary& expected, - const FunctionDefLibrary& actual, string* diff) { - std::unordered_map 
actual_index; + const FunctionDefLibrary& actual, + std::string* diff) { + std::unordered_map actual_index; for (const FunctionDef& function : actual.function()) { actual_index[function.signature().name()] = &function; } @@ -343,7 +344,7 @@ REGISTER_OP("AddNLikeTest") .SetIsAggregate(); Node* Sequencer(const GraphDefBuilder::Options& opts, - const string& call_node_name) { + const std::string& call_node_name) { if (opts.HaveError()) return nullptr; NodeBuilder node_builder(opts.GetNameForOp("NoOp"), "NoOp", opts.op_registry()); @@ -383,7 +384,7 @@ Node* KeyPlaceholderShape(const GraphDefBuilder::Options& opts) { return KnownShapeBase(DT_STRING, {2}, opts); } -Node* KeyPlaceholder(const string& call_node, +Node* KeyPlaceholder(const std::string& call_node, const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; NodeBuilder node_builder(absl::StrCat(call_node, "_key_placeholder"), @@ -396,15 +397,16 @@ Node* KeyPlaceholder(const string& call_node, .FinalizeBuilder(&node_builder); } -Node* RecvAtHost(ops::NodeOut key_input, const string& cluster, - const string& new_func_name, const string& oc_cluster, +Node* RecvAtHost(ops::NodeOut key_input, const std::string& cluster, + const std::string& new_func_name, + const std::string& oc_cluster, absl::Span dtypes, const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; - string key = absl::StrCat("host_compute_channel_", cluster, "_", - new_func_name, "_", oc_cluster); - string name = absl::StrCat("outside_compilation_", cluster, "_", - new_func_name, "_", oc_cluster, "_recv"); + std::string key = absl::StrCat("host_compute_channel_", cluster, "_", + new_func_name, "_", oc_cluster); + std::string name = absl::StrCat("outside_compilation_", cluster, "_", + new_func_name, "_", oc_cluster, "_recv"); NodeBuilder node_builder(opts.WithName(name).GetNameForOp("_XlaRecvAtHost"), "_XlaRecvAtHost", opts.op_registry()); node_builder.Input(std::move(key_input)); @@ -416,15 +418,16 @@ Node* 
RecvAtHost(ops::NodeOut key_input, const string& cluster, .FinalizeBuilder(&node_builder); } -Node* SendFromHost(ops::NodeOut key_input, const string& cluster, - const string& new_func_name, const string& oc_cluster, +Node* SendFromHost(ops::NodeOut key_input, const std::string& cluster, + const std::string& new_func_name, + const std::string& oc_cluster, const std::vector& inputs, const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; - string key = absl::StrCat("host_compute_channel_", cluster, "_", - new_func_name, "_", oc_cluster); - string name = absl::StrCat("outside_compilation_", cluster, "_", - new_func_name, "_", oc_cluster, "_send"); + std::string key = absl::StrCat("host_compute_channel_", cluster, "_", + new_func_name, "_", oc_cluster); + std::string name = absl::StrCat("outside_compilation_", cluster, "_", + new_func_name, "_", oc_cluster, "_send"); NodeBuilder node_builder(opts.WithName(name).GetNameForOp("_XlaSendFromHost"), "_XlaSendFromHost", opts.op_registry()); node_builder.Input(inputs); @@ -477,8 +480,9 @@ Node* RetOp(int index, ops::NodeOut a, const GraphDefBuilder::Options& opts) { return opts.FinalizeBuilder(&node_builder); } -absl::Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library, - const std::vector& encapsulated_functions) { +absl::Status Encapsulate( + GraphDef* graphdef, FunctionDefLibrary* library, + const std::vector& encapsulated_functions) { absl::Status s; // Convert the GraphDef to a Graph std::unique_ptr lib_def( @@ -512,7 +516,7 @@ absl::Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library, &graph_out, lib_def.get()); if (!s.ok()) return s; - std::unordered_map clusters; + std::unordered_map clusters; for (const auto& func : encapsulated_functions) { Node* xla_computation_node; for (Node* n : graph_out->nodes()) { @@ -527,7 +531,7 @@ absl::Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library, func_name_attrs.set_name(func); clusters.emplace(func, 
XlaClusterInfo{func, func_name_attrs, xla_computation_node, - std::map{}}); + std::map{}}); } bool modified; s = ExtractOutsideCompilation("_encapsulate", "_outside", clusters, @@ -551,7 +555,7 @@ absl::Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library, } absl::Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library) { - std::vector encapsulated_functions; + std::vector encapsulated_functions; return Encapsulate(graphdef, library, encapsulated_functions); } @@ -698,8 +702,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctions) { } // Returns a vector of node names in 'graph', sorted by name. -std::vector GraphNodes(const Graph& graph) { - std::vector nodes; +std::vector GraphNodes(const Graph& graph) { + std::vector nodes; for (const auto& node : graph.nodes()) { if (!node->IsSource() && !node->IsSink()) { nodes.push_back(node->name()); @@ -710,8 +714,9 @@ std::vector GraphNodes(const Graph& graph) { } // Returns a sorted vector of (src, dst) edges in 'graph'. -std::vector> GraphEdges(const Graph& graph) { - std::vector> edges; +std::vector> GraphEdges( + const Graph& graph) { + std::vector> edges; for (const Edge* edge : graph.edges()) { if (edge->src()->IsSource() || edge->dst()->IsSink()) continue; edges.emplace_back( @@ -742,10 +747,11 @@ TEST(EncapsulateSubgraphsTest, InputDeduplication) { /*rewrite_subgraph_fn=*/{}, /*reuse_existing_functions=*/false, &graph, &library)); - std::vector expected_nodes = {"cluster1", "cluster2", "mul", "x"}; + std::vector expected_nodes = {"cluster1", "cluster2", "mul", + "x"}; EXPECT_EQ(expected_nodes, GraphNodes(*graph)); - std::vector> expected_edges = { + std::vector> expected_edges = { {"cluster1:0", "cluster2:0"}, {"cluster1:0", "mul:0"}, {"cluster2:0", "mul:1"}, @@ -753,7 +759,7 @@ TEST(EncapsulateSubgraphsTest, InputDeduplication) { EXPECT_EQ(expected_edges, GraphEdges(*graph)); } -const Node* FindNodeByName(const Graph& graph, const string& name) { +const Node* FindNodeByName(const Graph& graph, 
const std::string& name) { for (const Node* node : graph.nodes()) { if (node->name() == name) return node; } @@ -889,7 +895,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -931,7 +937,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) { {"C:o:0", "c:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT, DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -941,7 +947,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) { {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}, {"c"}}, @@ -1025,7 +1031,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -1102,7 +1108,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { {"F:o:0", "D:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT, DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT, DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O2"}, {"send_key", ""}, {"recv_key", ""}, @@ -1112,8 +1118,9 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O2"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node", - "outside_compilation_O1_host_compute"})}, + 
absl::Span( + {"_xla_token_arg_node", + "outside_compilation_O1_host_compute"})}, {"_xla_original_oc_node_name", "outside_compilation_O2_host_compute"}}, {"F", "outside_compilation_O1_host_compute"}}, @@ -1122,7 +1129,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { {"C:o:0", "D:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT, DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -1132,7 +1139,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}, {"D"}}, @@ -1235,7 +1242,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1", "F2"}; + std::vector encapsulated_functions{"F1", "F2"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -1262,7 +1269,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { {"C:o:0", "D:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT, DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -1273,7 +1280,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}, {"D"}}, @@ -1295,7 +1302,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { {"d_0_arg", "G:o:0"}, {{"Tinputs", 
absl::Span({DT_FLOAT, DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F2_F2_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -1306,7 +1313,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}}, }, @@ -1409,7 +1416,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1", "F2"}; + std::vector encapsulated_functions{"F1", "F2"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -1432,7 +1439,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) { {"C:o:0", "D:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT, DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -1443,7 +1450,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) { absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}, {"D"}}, @@ -1462,7 +1469,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) { {"G:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F2_F2_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -1473,7 +1480,7 @@ 
TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) { absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}}, }, @@ -1556,7 +1563,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -1578,7 +1585,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { {"a_0_arg"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -1589,7 +1596,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}}, }, @@ -1652,7 +1659,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -1674,7 +1681,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) { {"a_0_arg"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -1685,7 +1692,7 @@ 
TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) { absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}, {"D"}}, @@ -1748,7 +1755,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -1785,7 +1792,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) { {"D:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -1795,7 +1802,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) { {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}}, }, @@ -1858,7 +1865,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -1899,7 +1906,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) { {"D:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -1909,7 +1916,7 @@ 
TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) { {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}}, }, @@ -1978,7 +1985,7 @@ TEST(EncapsulateSubgraphsTest, TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -2037,7 +2044,7 @@ TEST(EncapsulateSubgraphsTest, {"a_0_arg"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -2047,7 +2054,7 @@ TEST(EncapsulateSubgraphsTest, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}}, {{"outside_compilation_O2_host_compute"}, @@ -2055,7 +2062,7 @@ TEST(EncapsulateSubgraphsTest, {"F:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O2"}, {"send_key", ""}, {"recv_key", ""}, @@ -2065,8 +2072,9 @@ TEST(EncapsulateSubgraphsTest, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O2"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node", - "outside_compilation_O1_host_compute"})}, + absl::Span( + {"_xla_token_arg_node", + "outside_compilation_O1_host_compute"})}, {"_xla_original_oc_node_name", "outside_compilation_O2_host_compute"}}, {"outside_compilation_O1_host_compute"}}, @@ -2149,7 +2157,7 @@ 
TEST(EncapsulateSubgraphsTest, TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -2189,7 +2197,7 @@ TEST(EncapsulateSubgraphsTest, {"a_0_arg"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O2"}, {"send_key", ""}, {"recv_key", ""}, @@ -2199,8 +2207,9 @@ TEST(EncapsulateSubgraphsTest, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O2"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node", - "outside_compilation_O1_host_compute"})}, + absl::Span( + {"_xla_token_arg_node", + "outside_compilation_O1_host_compute"})}, {"_xla_original_oc_node_name", "outside_compilation_O2_host_compute"}}, {"outside_compilation_O1_host_compute"}}, @@ -2209,7 +2218,7 @@ TEST(EncapsulateSubgraphsTest, {"D:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -2219,7 +2228,7 @@ TEST(EncapsulateSubgraphsTest, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}}, }, @@ -2303,7 +2312,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -2340,7 +2349,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { {"D:o:0"}, 
{{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -2350,7 +2359,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}}, {{"outside_compilation_O2_host_compute"}, @@ -2358,7 +2367,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { {"D:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O2"}, {"send_key", ""}, {"recv_key", ""}, @@ -2368,7 +2377,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O2"}, {"_xla_token_input_nodes", - absl::Span( + absl::Span( {"_xla_token_arg_node", "outside_compilation_O1_host_compute"})}, {"_xla_original_oc_node_name", "outside_compilation_O2_host_compute"}}, {"outside_compilation_O1_host_compute"}}, @@ -2377,7 +2386,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { {"D:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O3"}, {"send_key", ""}, {"recv_key", ""}, @@ -2387,9 +2396,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O3"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node", - "outside_compilation_O1_host_compute", - "outside_compilation_O2_host_compute"})}, + absl::Span( + {"_xla_token_arg_node", 
"outside_compilation_O1_host_compute", + "outside_compilation_O2_host_compute"})}, {"_xla_original_oc_node_name", "outside_compilation_O3_host_compute"}}, {"outside_compilation_O1_host_compute", "outside_compilation_O2_host_compute"}}}, @@ -2470,7 +2479,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -2507,7 +2516,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) { {"a_0_arg"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -2517,7 +2526,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) { {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}}, }, @@ -2586,7 +2595,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -2627,7 +2636,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) { {"c_0_arg", "c:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT, DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -2637,7 +2646,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) { 
{"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}, {"c"}}, diff --git a/tensorflow/compiler/jit/encapsulate_util.cc b/tensorflow/compiler/jit/encapsulate_util.cc index fa94a341bbabc6..445ca63c05ad66 100644 --- a/tensorflow/compiler/jit/encapsulate_util.cc +++ b/tensorflow/compiler/jit/encapsulate_util.cc @@ -36,7 +36,8 @@ namespace { // Returns string attribute value for the node if the attribute is present, // otherwise returns empty optional value. -std::optional GetStringAttr(const Node& n, const string& attr_name) { +std::optional GetStringAttr(const Node& n, + const std::string& attr_name) { auto attr = n.attrs().Find(attr_name); if (!attr) { return std::nullopt; @@ -47,8 +48,8 @@ std::optional GetStringAttr(const Node& n, const string& attr_name) { // Adds a value to the node's list attribute. template -absl::Status AppendToListAttr(Node* n, const string& attr_name, - const string& value) { +absl::Status AppendToListAttr(Node* n, const std::string& attr_name, + const std::string& value) { std::vector attr_value; absl::Status s = GetNodeAttr(n->attrs(), attr_name, &attr_value); if (!s.ok() && s.code() != error::NOT_FOUND) { @@ -63,7 +64,7 @@ absl::Status AppendToListAttr(Node* n, const string& attr_name, // Replaces attribute value. template -void ReplaceAttr(Node* n, const string& attr_name, const T& value) { +void ReplaceAttr(Node* n, const std::string& attr_name, const T& value) { n->ClearAttr(attr_name); n->AddAttr(attr_name, value); } @@ -71,7 +72,7 @@ void ReplaceAttr(Node* n, const string& attr_name, const T& value) { // Step 1 for `PreprocessEdgesBetweenOutsideCompilations`. See comments of // `PreprocessEdgesBetweenOutsideCompilations` for details. 
absl::Status PreprocessControlEdgesBetweenOutsideCompilations( - Graph* g, const string& outside_compilation_attr_name) { + Graph* g, const std::string& outside_compilation_attr_name) { // Gather edges to remove. We should not remove the edge while iterating. std::vector edges_to_remove; for (const Edge* e : g->edges()) { @@ -89,7 +90,7 @@ absl::Status PreprocessControlEdgesBetweenOutsideCompilations( // Case 1a: outside compilation to outside compilation control edge. edges_to_remove.push_back(e); - TF_RETURN_IF_ERROR(AppendToListAttr( + TF_RETURN_IF_ERROR(AppendToListAttr( e->dst(), kXlaControlDependenciesWithinXlaClusterAttrName, e->src()->name())); } @@ -111,7 +112,7 @@ absl::Status PreprocessControlEdgesBetweenOutsideCompilations( // Step 2 for `PreprocessEdgesBetweenOutsideCompilations`. See comments of // `PreprocessEdgesBetweenOutsideCompilations` for details. absl::Status PreprocessDataEdgesBetweenOutsideCompilations( - Graph* g, const string& outside_compilation_attr_name) { + Graph* g, const std::string& outside_compilation_attr_name) { // Gather edges between outside compilation and host computation. Notice that // we do not store `Edge*` directly because we remove some nodes while adding // Identity nodes, and those Edge pointers might be invalidated. @@ -138,7 +139,7 @@ absl::Status PreprocessDataEdgesBetweenOutsideCompilations( // Remove the edge from host to outside compilation. Add a placeholder as // outside compilation node input. - std::map, Node*> placeholders; + std::map, Node*> placeholders; for (int i = 0, end = edges.size(); i < end; i++) { Node* dst = g->FindNodeId(edges[i].dst_node_id); const Edge* e; @@ -148,7 +149,7 @@ absl::Status PreprocessDataEdgesBetweenOutsideCompilations( g->RemoveEdge(e); // Find or create placeholder node. 
- string new_name = + std::string new_name = absl::StrCat(src->name(), "_oc_to_oc_placeholder_", src_output); auto placeholder_index = std::make_pair(src->name(), src_output); auto iter = placeholders.find(placeholder_index); @@ -156,7 +157,7 @@ absl::Status PreprocessDataEdgesBetweenOutsideCompilations( if (iter == placeholders.end()) { NodeDefBuilder placeholder_builder(new_name, "Placeholder"); placeholder_builder.Attr("dtype", src->output_type(src_output)); - string outside_compilation_attr; + std::string outside_compilation_attr; TF_RETURN_IF_ERROR(GetNodeAttr(dst->attrs(), outside_compilation_attr_name, &outside_compilation_attr)); @@ -195,7 +196,7 @@ absl::Status PreprocessDataEdgesBetweenOutsideCompilations( // Step 1 for `PostprocessEdgesBetweenOutsideCompilations`. See comments of // `PostprocessEdgesBetweenOutsideCompilations` for details. absl::Status PostprocessDataEdgesBetweenOutsideCompilations( - Graph* g, const string& outside_compilation_attr_name) { + Graph* g, const std::string& outside_compilation_attr_name) { // Gather all outside compilation to outside compilation nodes. std::vector placeholder_nodes; for (Node* n : g->nodes()) { @@ -208,7 +209,7 @@ absl::Status PostprocessDataEdgesBetweenOutsideCompilations( // Remove the placeholder nodes, and reconnect original edge. auto node_name_index = g->BuildNodeNameIndex(); for (auto n : placeholder_nodes) { - string node_name; + std::string node_name; int node_src_output; TF_RETURN_IF_ERROR(GetNodeAttr( n->attrs(), kOutsideCompilationOriginalNodeAttrName, &node_name)); @@ -271,12 +272,12 @@ absl::Status PostprocessDataEdgesBetweenOutsideCompilations( // Step 2 for `PostprocessEdgesBetweenOutsideCompilations`. See comments of // `PostprocessEdgesBetweenOutsideCompilations` for details. 
absl::Status PostprocessControlEdgesBetweenOutsideCompilations( - Graph* g, const string& outside_compilation_attr_name) { + Graph* g, const std::string& outside_compilation_attr_name) { auto node_name_index = g->BuildNodeNameIndex(); // Reconnect outside compilation to outside compilation control edge. for (Node* n : g->nodes()) { - std::vector control_deps; + std::vector control_deps; absl::Status s = GetNodeAttr(n->attrs(), kXlaControlDependenciesWithinXlaClusterAttrName, &control_deps); @@ -288,7 +289,7 @@ absl::Status PostprocessControlEdgesBetweenOutsideCompilations( } } else { n->ClearAttr(kXlaControlDependenciesWithinXlaClusterAttrName); - for (const string& control_input : control_deps) { + for (const std::string& control_input : control_deps) { auto iter = node_name_index.find(control_input); if (iter == node_name_index.end()) { return errors::Internal("Cannot find original node for ", @@ -342,11 +343,11 @@ absl::Status PerformStaticShapeInferenceBeforeEncapsulation(Graph* g) { } absl::StatusOr< - std::unique_ptr>>> + std::unique_ptr>>> OutsideCompilationClusterDependencies( - const Graph* g, const string& outside_compilation_attr_name) { + const Graph* g, const std::string& outside_compilation_attr_name) { auto cluster_deps = std::make_unique< - absl::flat_hash_map>>(); + absl::flat_hash_map>>(); for (const Edge* e : g->edges()) { auto src_outside_compilation = @@ -360,18 +361,18 @@ OutsideCompilationClusterDependencies( if (dst_deps_it == cluster_deps->end()) { cluster_deps->insert(std::make_pair( *dst_outside_compilation, - absl::flat_hash_set({*src_outside_compilation}))); + absl::flat_hash_set({*src_outside_compilation}))); } else { dst_deps_it->second.insert(*src_outside_compilation); } } } - auto cluster_deps_ordered = - std::make_unique>>(); + auto cluster_deps_ordered = std::make_unique< + absl::flat_hash_map>>(); for (auto it = cluster_deps->begin(); it != cluster_deps->end(); it++) { - std::vector ordered_deps(it->second.begin(), 
it->second.end()); + std::vector ordered_deps(it->second.begin(), it->second.end()); std::sort(ordered_deps.begin(), ordered_deps.end()); cluster_deps_ordered->insert(std::make_pair(it->first, ordered_deps)); } @@ -380,7 +381,7 @@ OutsideCompilationClusterDependencies( } absl::Status PreprocessEdgesBetweenOutsideCompilations( - Graph* g, const string& outside_compilation_attr_name) { + Graph* g, const std::string& outside_compilation_attr_name) { // Remove edges from source node to outside compilation nodes, and edges // from outside compilation nodes to sink node. std::vector edges_to_remove; @@ -406,7 +407,7 @@ absl::Status PreprocessEdgesBetweenOutsideCompilations( } absl::Status PostprocessEdgesBetweenOutsideCompilations( - Graph* g, const string& outside_compilation_attr_name) { + Graph* g, const std::string& outside_compilation_attr_name) { TF_RETURN_IF_ERROR(PostprocessDataEdgesBetweenOutsideCompilations( g, outside_compilation_attr_name)); TF_RETURN_IF_ERROR(PostprocessControlEdgesBetweenOutsideCompilations( diff --git a/tensorflow/compiler/jit/encapsulate_util.h b/tensorflow/compiler/jit/encapsulate_util.h index 7c99763c770728..81ab31c79dcda2 100644 --- a/tensorflow/compiler/jit/encapsulate_util.h +++ b/tensorflow/compiler/jit/encapsulate_util.h @@ -95,21 +95,21 @@ struct XlaClusterInfo { // without losing aggregate initialization, which allows us to get rid of // the constructor definitions again. XlaClusterInfo() {} - XlaClusterInfo(const string& cluster_name, + XlaClusterInfo(const std::string& cluster_name, const NameAttrList& func_name_attrs, Node* node, - const std::map& host_compute_core) + const std::map& host_compute_core) : cluster_name(cluster_name), func_name_attrs(func_name_attrs), node(node), host_compute_core(host_compute_core) {} // XLA cluster name. It might be different from `func_name`. - const string cluster_name; + const std::string cluster_name; // Name and attributes of XLA computation function. 
const NameAttrList func_name_attrs; // The XLA computation node in the graph. Node* node; // A mapping from outside compilation cluster name to its device assignment. - const std::map host_compute_core; + const std::map host_compute_core; }; // Finds dependencies between outside compilation clusters, including both data @@ -117,9 +117,9 @@ struct XlaClusterInfo { // outside compilation cluster to a set of names of outside compilation clusters // that it depends on. absl::StatusOr< - std::unique_ptr>>> + std::unique_ptr>>> OutsideCompilationClusterDependencies( - const Graph* g, const string& outside_compilation_attr_name); + const Graph* g, const std::string& outside_compilation_attr_name); // Preprocesses edges within the same XLA cluster. It will perform the following // operations in order: @@ -135,7 +135,7 @@ OutsideCompilationClusterDependencies( // 2. For data edges between different outside compilations, remove the edge // and create a Placeholder node as dst node's input. absl::Status PreprocessEdgesBetweenOutsideCompilations( - Graph* g, const string& outside_compilation_attr_name); + Graph* g, const std::string& outside_compilation_attr_name); // Postprocesses edges within the same XLA cluster. This function reverts what // `PreprocessEdgesBetweenOutsideCompilations` did. It will perform the @@ -149,7 +149,7 @@ absl::Status PreprocessEdgesBetweenOutsideCompilations( // `PreprocessEdgesBetweenOutsideCompilations` step 1b are not handled here. // They are handled in `RewriteOutsideCompilationSubgraphFn`. 
absl::Status PostprocessEdgesBetweenOutsideCompilations( - Graph* g, const string& outside_compilation_attr_name); + Graph* g, const std::string& outside_compilation_attr_name); } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_ENCAPSULATE_UTIL_H_ diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc index 0e59bf0c19d93e..8ba11404010363 100644 --- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc @@ -46,7 +46,7 @@ const char* const kXlaClusterOutput = "XlaClusterOutput"; bool IsCpuGpuCompile(const Graph* graph) { for (Node* n : graph->nodes()) { - string name; + std::string name; // Only consider nodes being compiled. if (!TryGetNodeAttr(n->attrs(), kXlaClusterIdAttr, &name)) continue; // Early return for any node with a device that is not a CPU or GPU. @@ -185,7 +185,7 @@ absl::Status RewriteSubgraph( // Uniquify the function name by computing a fingerprint of the function. // Nondeterminism in serialization would not lead to incorrect results, but // may cause spurious cache misses. 
- TF_ASSIGN_OR_RETURN(uint64 fingerprint, FingerprintGraph(*graph)); + TF_ASSIGN_OR_RETURN(uint64_t fingerprint, FingerprintGraph(*graph)); VLOG(1) << "Subgraph fingerprint:" << fingerprint; call_def->set_op(absl::StrCat(call_def->op(), "_", fingerprint)); return absl::OkStatus(); @@ -360,7 +360,8 @@ absl::Status RewriteSubgraph( /*static*/ absl::Status EncapsulateXlaComputationsPass::BuildXlaLaunchOps( Graph* graph) { const auto is_xla_launch_node = [](const Node& node) -> absl::StatusOr { - const string& name = GetNodeAttrString(node.attrs(), kXlaClusterIdAttr); + const std::string& name = + GetNodeAttrString(node.attrs(), kXlaClusterIdAttr); return !name.empty(); }; diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc index 16a17c3c2a03a6..acd5319cf8ed16 100644 --- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc @@ -34,7 +34,7 @@ limitations under the License. namespace tensorflow { static std::unique_ptr MakeOuterGraph( - const FunctionLibraryDefinition& flib_def, const string& function) { + const FunctionLibraryDefinition& flib_def, const std::string& function) { Scope scope = Scope::NewRootScope().ExitOnError(); TF_EXPECT_OK(scope.graph()->AddFunctionLibrary(flib_def.ToProto())); @@ -143,7 +143,7 @@ TEST(EncapsulateXlaComputations, DeterministicEncapsulate) { // Test that control edge insertion order doesn't affect the cache key // (cluster name) generated by TPU encapsulate pass. 
auto get_serialized_graph = [](bool control_input_reversed, - bool operand_reversed) -> string { + bool operand_reversed) -> std::string { FunctionLibraryDefinition flib_def(OpRegistry::Global(), FunctionDefLibrary()); std::unique_ptr graph(new Graph(&flib_def)); @@ -250,8 +250,8 @@ TEST(EncapsulateXlaComputations, Encapsulate) { TF_ASSERT_OK(EncapsulateXlaComputationsPass::Encapsulate(&graph, &flib_def)); - std::unordered_map index = graph->BuildNodeNameIndex(); - string function = index.at("launch0")->type_string(); + std::unordered_map index = graph->BuildNodeNameIndex(); + std::string function = index.at("launch0")->type_string(); // Tests the outer graph is as expected. { @@ -285,9 +285,9 @@ TEST(EncapsulateXlaComputations, Encapsulate) { // function. Encapsulation should be deterministic to avoid recompilation. TF_ASSERT_OK( EncapsulateXlaComputationsPass::Encapsulate(&graph_copy, &flib_def)); - std::unordered_map index_copy = + std::unordered_map index_copy = graph_copy->BuildNodeNameIndex(); - string function_copy = index_copy.at("launch0")->type_string(); + std::string function_copy = index_copy.at("launch0")->type_string(); EXPECT_EQ(function, function_copy); } diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc index 140c47dbcac804..05514f00bd29d5 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc @@ -42,7 +42,7 @@ namespace { // Control return mapping function for outside compilation host graphs. // All nodes with kXlaHasHostTransfer attribute are control outputs. -std::optional HostGraphControlRetMapping(const Node* n) { +std::optional HostGraphControlRetMapping(const Node* n) { if (HasNodeAttr(n->def(), kXlaHasHostTransferAttrName)) { return n->name(); } @@ -52,7 +52,7 @@ std::optional HostGraphControlRetMapping(const Node* n) { // Add a key placeholder node to the graph. 
The key placeholder node will be // used as input for XlaRecvAtHost/XlaSendFromHost nodes. absl::StatusOr AddHostComputeKeyPlaceholder( - const string& xla_cluster_name, Graph* g) { + const std::string& xla_cluster_name, Graph* g) { NodeDef key_def; NodeDefBuilder builder(absl::StrCat(xla_cluster_name, "_key_placeholder"), "Placeholder"); @@ -74,7 +74,8 @@ bool IsKeyPlaceholderNode(const Node& n) { } // Returns nodes with given type. -std::vector GatherNodesWithType(const Graph& g, const string& type) { +std::vector GatherNodesWithType(const Graph& g, + const std::string& type) { std::vector result; for (Node* n : g.nodes()) { if (n->type_string() == type) { @@ -105,7 +106,7 @@ absl::Status GetArgDataTypes(const std::vector& arg_nodes, // Builds XlaRecvAtHost node. absl::StatusOr BuildRecvAtHostNode( - Graph* g, const string& oc_cluster_name, + Graph* g, const std::string& oc_cluster_name, const std::vector& recv_at_host_dtypes, Node* key_placeholder) { NodeDefBuilder recv_at_host_builder( absl::StrCat("outside_compilation_", oc_cluster_name, "_recv"), @@ -128,7 +129,7 @@ absl::StatusOr BuildRecvAtHostNode( // Builds XlaRecvAtHost node, and replaces all _Arg nodes with it. absl::StatusOr ReplaceArgNodesWithRecvAtHostNode( - Graph* g, const string& oc_cluster_name, + Graph* g, const std::string& oc_cluster_name, std::vector* recv_at_host_dtypes, Node* key_placeholder) { // TODO(b/77601805): use out nodes for source node, instead of traversing all // nodes. @@ -205,7 +206,7 @@ absl::Status GetRetDataTypes(const std::vector& ret_nodes, // Builds XlaSendFromHost node. absl::StatusOr BuildSendFromHostNode( - Graph* g, const string& oc_cluster_name, + Graph* g, const std::string& oc_cluster_name, const std::vector& ret_nodes, const std::vector& send_from_host_dtypes, Node* key_placeholder) { NodeDefBuilder send_from_host_builder( @@ -245,7 +246,7 @@ absl::StatusOr BuildSendFromHostNode( // Builds XlaSendFromHost node, and replaces all _Retval nodes with it. 
absl::StatusOr ReplaceRetNodesWithSendFromHostNode( - Graph* g, const string& oc_cluster_name, + Graph* g, const std::string& oc_cluster_name, std::vector* send_from_host_dtypes, Node* key_placeholder) { // TODO(b/77601805): use in nodes for sink node, instead of traversing all // nodes. @@ -299,16 +300,17 @@ std::optional> GetInferredInputShapes( return results; } -string host_compute_node_name(const string& original_oc_name) { +std::string host_compute_node_name(const std::string& original_oc_name) { return absl::StrCat("outside_compilation_", original_oc_name, "_host_compute"); } // Builds XlaHostCompute NodeDef from the outside compilation call node. absl::StatusOr BuildXlaHostComputeNodeDef( - const Node* call_node, const std::map& host_compute_core, - const absl::flat_hash_map>& cluster_deps) { - string original_oc_name; + const Node* call_node, const std::map& host_compute_core, + const absl::flat_hash_map>& + cluster_deps) { + std::string original_oc_name; TF_RETURN_IF_ERROR(GetNodeAttr( call_node->attrs(), "_outside_compilation_subgraph", &original_oc_name)); NodeDefBuilder host_compute_builder(host_compute_node_name(original_oc_name), @@ -341,7 +343,7 @@ absl::StatusOr BuildXlaHostComputeNodeDef( // according to their host-side graph dependency. This can cause deadlock. // Therefore, we hint XLA what the correct ordering of these clusters should // be to avoid deadlocks. - std::vector xla_token_input_nodes; + std::vector xla_token_input_nodes; xla_token_input_nodes.emplace_back(kXlaTokenArgNodeName); auto cluster_deps_it = cluster_deps.find(original_oc_name); if (cluster_deps_it != cluster_deps.end()) { @@ -376,8 +378,10 @@ absl::StatusOr BuildXlaHostComputeNodeDef( // Replace outside compilation function call node with XlaHostCompute node. 
TF_ATTRIBUTE_NOINLINE absl::StatusOr ReplaceOutsideCompilationCallNode( - Graph* g, Node* call_node, const std::map& host_compute_core, - const absl::flat_hash_map>& cluster_deps) { + Graph* g, Node* call_node, + const std::map& host_compute_core, + const absl::flat_hash_map>& + cluster_deps) { // Build XlaHostCompute NodeDef. TF_ASSIGN_OR_RETURN( NodeDef node_def, @@ -405,8 +409,8 @@ absl::Status ResetDeviceOrdinalToPlaceholderValue(Graph* g) { n->ClearAttr("device_ordinal"); n->AddAttr("device_ordinal", device_ordinal_value); } else if (n->IsIfNode()) { - for (const string& attr_name : - std::vector{"then_branch", "else_branch"}) { + for (const std::string& attr_name : + std::vector{"then_branch", "else_branch"}) { NameAttrList branch_func; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), attr_name, &branch_func)); (*branch_func.mutable_attr())["_device_ordinal"] = device_ordinal_value; @@ -414,7 +418,8 @@ absl::Status ResetDeviceOrdinalToPlaceholderValue(Graph* g) { n->AddAttr(attr_name, branch_func); } } else if (n->IsWhileNode()) { - for (const string& attr_name : std::vector{"cond", "body"}) { + for (const std::string& attr_name : + std::vector{"cond", "body"}) { NameAttrList branch_func; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), attr_name, &branch_func)); (*branch_func.mutable_attr())["_device_ordinal"] = device_ordinal_value; @@ -448,11 +453,12 @@ bool HasLiftedArgs(const FunctionDef& function_def) { absl::StatusOr>> LiftedArgsAndOutsideCompilationNodesInFunctionBody( const FunctionBody& function_body, - const std::unordered_map& outside_compilation_attr_to_node) { + const std::unordered_map& + outside_compilation_attr_to_node) { std::vector> lifted_arg_nodes_and_outside_compilation_nodes; for (Node* n : function_body.graph->op_nodes()) { - string oc_cluster; + std::string oc_cluster; if (n->type_string() == "Placeholder" && GetNodeAttr(n->def(), kXlaLiftedArgOutsideCompilationAttrName, &oc_cluster) @@ -471,7 +477,7 @@ 
LiftedArgsAndOutsideCompilationNodesInFunctionBody( absl::StatusOr> UpdateTypesAttribute( const std::vector>& lifted_arg_nodes_and_outside_compilation_nodes, - const string& type_attr_name, Node* n) { + const std::string& type_attr_name, Node* n) { std::vector data_types; data_types.reserve(lifted_arg_nodes_and_outside_compilation_nodes.size()); TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), type_attr_name, &data_types)); @@ -578,7 +584,8 @@ absl::Status AddFunctionWithNewName(const std::string& new_name, // Reconnect outside compilation lifted arguments in a functional While node to // its outside compilation tensor sources. absl::Status PostprocessLiftedArgsForWhile( - const std::unordered_map& outside_compilation_attr_to_node, + const std::unordered_map& + outside_compilation_attr_to_node, Graph* g, Node* n, FunctionLibraryDefinition* fld) { TF_RET_CHECK(n->IsWhileNode()); @@ -687,7 +694,8 @@ absl::Status PostprocessLiftedArgsForWhile( } absl::Status PostprocessLiftedArgsForIf( - const std::unordered_map& outside_compilation_attr_to_node, + const std::unordered_map& + outside_compilation_attr_to_node, Graph* g, Node* n, FunctionLibraryDefinition* fld) { TF_RET_CHECK(n->IsIfNode()); @@ -826,7 +834,8 @@ absl::Status PostprocessLiftedArgsForIf( } absl::Status PostprocessLiftedArgsForCall( - const std::unordered_map& outside_compilation_attr_to_node, + const std::unordered_map& + outside_compilation_attr_to_node, Graph* g, Node* n, FunctionLibraryDefinition* fld) { const FunctionDef* fdef = fld->Find(n->type_string()); TF_RET_CHECK(fdef); @@ -924,12 +933,12 @@ absl::Status PostprocessLiftedArgsForCall( // Creates a mapping from outside compilation cluster name to lifted argument // placeholder. 
-absl::StatusOr> OutsideCompilationAttrToNode( - const Graph& g) { - std::unordered_map outside_compilation_attr_to_node; +absl::StatusOr> +OutsideCompilationAttrToNode(const Graph& g) { + std::unordered_map outside_compilation_attr_to_node; for (Node* n : g.op_nodes()) { bool is_lifted_arg; - string outside_compilation_attr; + std::string outside_compilation_attr; if (TryGetNodeAttr(n->def(), kXlaIsLiftedArgAttrName, &is_lifted_arg) && TryGetNodeAttr(n->def(), "_xla_outside_compilation", &outside_compilation_attr)) { @@ -988,8 +997,9 @@ absl::Status PostprocessLiftedArgs(Graph* g, FunctionLibraryDefinition* fld) { // replace this node with compilation result node. // 3) all outside compilation graphs. absl::Status ConstructHostGraph( - const string& xla_cluster_name, const string& outside_compilation_attr_name, - const std::vector& outside_compilation_host_graphs, + const std::string& xla_cluster_name, + const std::string& outside_compilation_attr_name, + const std::vector& outside_compilation_host_graphs, FunctionLibraryDefinition* fld, std::unique_ptr* host_graph) { host_graph->reset(new Graph(fld)); @@ -1013,7 +1023,7 @@ absl::Status ConstructHostGraph( // XlaSendFromHost, If/While nodes containing // XlaRecvAtHost/XlaSendFromHost) to sequencer node. // c) Clear node_def.device(), so device placer won't get confused. - for (const string& host_func : outside_compilation_host_graphs) { + for (const std::string& host_func : outside_compilation_host_graphs) { VLOG(4) << "Expanding host graph " << host_func; // Temporarily use "0" as "_device_ordinal". It will be reset to placeholder // value after we expanded all host graphs. We cannot just use placeholder @@ -1021,7 +1031,7 @@ absl::Status ConstructHostGraph( // value for attributes. 
AttrValue device_ordinal_attr; device_ordinal_attr.set_i(0); - protobuf::Map attrs; + protobuf::Map attrs; attrs["_device_ordinal"] = device_ordinal_attr; std::unique_ptr host_fbody; const FunctionDef* host_fdef = fld->Find(host_func); @@ -1123,18 +1133,17 @@ absl::Status ConstructHostGraph( // Expand XLA computation's outside compilation host side graph into main graph. // Add a control edge between sequencer node and the XLA computation node. -absl::Status ExpandHostGraphIntoMainGraph(Graph* main_graph, - FunctionLibraryDefinition* fld, - const string& host_graph_func_name, - Node* xla_computation_node, - Node* pivot_node) { +absl::Status ExpandHostGraphIntoMainGraph( + Graph* main_graph, FunctionLibraryDefinition* fld, + const std::string& host_graph_func_name, Node* xla_computation_node, + Node* pivot_node) { // Temporarily use "0" as "_device_ordinal". It will be rewritten with the // correct value in a later pass. We cannot just use placeholder value here // because FunctionDef instantiation does not allow placeholder value for // attributes. AttrValue device_ordinal_attr; device_ordinal_attr.set_i(0); - protobuf::Map attrs; + protobuf::Map attrs; attrs["_device_ordinal"] = device_ordinal_attr; std::unique_ptr fbody; const FunctionDef* host_graph_func = fld->Find(host_graph_func_name); @@ -1207,12 +1216,12 @@ absl::Status ExpandHostGraphIntoMainGraph(Graph* main_graph, // 2) Remove control edges. // 3) Prune nodes that are not useful for shape inference. absl::Status RewriteShapeInferenceGraph( - const string& shape_inference_graph_name, Graph* host_graph, + const std::string& shape_inference_graph_name, Graph* host_graph, Node* pivot_node, FunctionLibraryDefinition* fld) { // Use "0" as "_device_ordinal". It does not matter for shape inference. 
AttrValue device_ordinal_attr; device_ordinal_attr.set_i(0); - protobuf::Map attrs; + protobuf::Map attrs; attrs["_device_ordinal"] = device_ordinal_attr; std::unique_ptr fbody; const FunctionDef* shape_inference_graph = @@ -1338,13 +1347,13 @@ void SetMaximalSharding(NodeDefBuilder& node_builder) { // Builds XlaSendToHost node which sends cond predicate to host. TF_ATTRIBUTE_NOINLINE absl::StatusOr BuildSendIfPredNode( - const string& name, const string& host_transfer_key, Node* pred_node, - Graph* g) { + const std::string& name, const std::string& host_transfer_key, + Node* pred_node, Graph* g) { NodeDefBuilder send_pred_builder(name, "XlaSendToHost"); send_pred_builder.Attr("Tinput", DT_BOOL); send_pred_builder.Attr("key", absl::StrCat(host_transfer_key, "_dtoh_0")); send_pred_builder.Attr(kXlaTokenInputNodesAttrName, - std::vector{kXlaTokenArgNodeName}); + std::vector{kXlaTokenArgNodeName}); send_pred_builder.Attr(kXlaOriginalOutsideCompilationNodeName, name); SetMaximalSharding(send_pred_builder); send_pred_builder.Input(pred_node->name(), 0, DT_BOOL); @@ -1356,14 +1365,14 @@ TF_ATTRIBUTE_NOINLINE absl::StatusOr BuildSendIfPredNode( } // Replaces key placeholder node with an _Arg node. -absl::Status ReplaceKeyPlaceholderWithArgNode(const string& xla_cluster_name, - const string& func_name, - FunctionLibraryDefinition* fld) { +absl::Status ReplaceKeyPlaceholderWithArgNode( + const std::string& xla_cluster_name, const std::string& func_name, + FunctionLibraryDefinition* fld) { // Temporarily use "0" as "_device_ordinal". It will be reset to placeholder // value after rewriting. AttrValue device_ordinal_attr; device_ordinal_attr.set_i(0); - protobuf::Map attrs; + protobuf::Map attrs; attrs["_device_ordinal"] = device_ordinal_attr; std::unique_ptr fbody; const FunctionDef* func = fld->Find(func_name); @@ -1404,14 +1413,15 @@ absl::Status ReplaceKeyPlaceholderWithArgNode(const string& xla_cluster_name, // Builds host side graph for If node. 
TF_ATTRIBUTE_NOINLINE absl::Status BuildHostGraphForIfNode( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, const string& xla_cluster_name, - const string& if_node_name, const string& host_transfer_key, - const string& host_graph_func_name, FunctionLibraryDefinition* fld, - const string& then_branch_host_func_name, - const string& else_branch_host_func_name) { + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, const std::string& if_node_name, + const std::string& host_transfer_key, + const std::string& host_graph_func_name, FunctionLibraryDefinition* fld, + const std::string& then_branch_host_func_name, + const std::string& else_branch_host_func_name) { Graph host_graph(fld); - string outside_compilation_name = absl::StrCat("oc_if_", if_node_name); + std::string outside_compilation_name = absl::StrCat("oc_if_", if_node_name); AttrValue device_ordinal_value; device_ordinal_value.set_placeholder("_device_ordinal"); @@ -1484,7 +1494,7 @@ TF_ATTRIBUTE_NOINLINE absl::Status BuildHostGraphForIfNode( // Rewrites loop cond to add a node which sends loop cond to host. TF_ATTRIBUTE_NOINLINE absl::Status AddSendLoopPredToLoopCond( - const string& cond_xla_func_name, const string& host_transfer_key, + const std::string& cond_xla_func_name, const std::string& host_transfer_key, NameAttrList* loop_cond_func, FunctionLibraryDefinition* fld, Node* while_node) { // Instantiate the loop cond function. 
@@ -1523,7 +1533,7 @@ TF_ATTRIBUTE_NOINLINE absl::Status AddSendLoopPredToLoopCond( send_loop_cond_builder.Attr("key", absl::StrCat(host_transfer_key, "_dtoh_0")); send_loop_cond_builder.Attr(kXlaTokenInputNodesAttrName, - std::vector{kXlaTokenArgNodeName}); + std::vector{kXlaTokenArgNodeName}); send_loop_cond_builder.Attr(kXlaOriginalOutsideCompilationNodeName, send_loop_cond_builder.node_name()); SetMaximalSharding(send_loop_cond_builder); @@ -1560,10 +1570,13 @@ TF_ATTRIBUTE_NOINLINE absl::Status AddSendLoopPredToLoopCond( // Rewrites while loop cond function for host. absl::Status RewriteHostWhileLoopCond( - const string& cond_host_func_name, const string& while_node_name, - const string& host_transfer_key, const string& xla_cluster_attr_name, - const string& xla_cluster_name, const string& outside_compilation_attr_name, - const string& outside_compilation_name, FunctionLibraryDefinition* fld) { + const std::string& cond_host_func_name, const std::string& while_node_name, + const std::string& host_transfer_key, + const std::string& xla_cluster_attr_name, + const std::string& xla_cluster_name, + const std::string& outside_compilation_attr_name, + const std::string& outside_compilation_name, + FunctionLibraryDefinition* fld) { // Replace key placeholder node with _Arg node. TF_RETURN_IF_ERROR(ReplaceKeyPlaceholderWithArgNode( xla_cluster_name, cond_host_func_name, fld)); @@ -1571,7 +1584,7 @@ absl::Status RewriteHostWhileLoopCond( // Instantiate cond function. AttrValue device_ordinal_temp_value; device_ordinal_temp_value.set_i(0); - protobuf::Map attrs; + protobuf::Map attrs; attrs["_device_ordinal"] = device_ordinal_temp_value; std::unique_ptr cond_fbody; const FunctionDef* cond_host_func = fld->Find(cond_host_func_name); @@ -1634,10 +1647,13 @@ absl::Status RewriteHostWhileLoopCond( // Rewrites while loop body function for host. 
absl::Status RewriteHostWhileLoopBody( - const string& body_host_func_name, const string& while_node_name, - const string& host_transfer_key, const string& xla_cluster_attr_name, - const string& xla_cluster_name, const string& outside_compilation_attr_name, - const string& outside_compilation_name, FunctionLibraryDefinition* fld) { + const std::string& body_host_func_name, const std::string& while_node_name, + const std::string& host_transfer_key, + const std::string& xla_cluster_attr_name, + const std::string& xla_cluster_name, + const std::string& outside_compilation_attr_name, + const std::string& outside_compilation_name, + FunctionLibraryDefinition* fld) { // Replace key placeholder node with _Arg node. TF_RETURN_IF_ERROR(ReplaceKeyPlaceholderWithArgNode( xla_cluster_name, body_host_func_name, fld)); @@ -1645,7 +1661,7 @@ absl::Status RewriteHostWhileLoopBody( // Instantiate body function. AttrValue device_ordinal_temp_value; device_ordinal_temp_value.set_i(0); - protobuf::Map attrs; + protobuf::Map attrs; attrs["_device_ordinal"] = device_ordinal_temp_value; std::unique_ptr body_fbody; const FunctionDef* body_host_func = fld->Find(body_host_func_name); @@ -1692,13 +1708,16 @@ absl::Status RewriteHostWhileLoopBody( // Builds host side graph for while node. 
TF_ATTRIBUTE_NOINLINE absl::Status BuildHostGraphForWhileNode( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, const string& xla_cluster_name, - const string& while_node_name, const string& host_transfer_key, - const string& host_graph_func_name, FunctionLibraryDefinition* fld, - const string& cond_host_func_name, const string& body_host_func_name) { + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, const std::string& while_node_name, + const std::string& host_transfer_key, + const std::string& host_graph_func_name, FunctionLibraryDefinition* fld, + const std::string& cond_host_func_name, + const std::string& body_host_func_name) { Graph host_graph(fld); - string outside_compilation_name = absl::StrCat("oc_while_", while_node_name); + std::string outside_compilation_name = + absl::StrCat("oc_while_", while_node_name); // Step 1: add key placeholder node. TF_ASSIGN_OR_RETURN( @@ -1759,10 +1778,12 @@ TF_ATTRIBUTE_NOINLINE absl::Status BuildHostGraphForWhileNode( // Builds host graph for func call nodes. 
absl::Status BuildHostGraphForFuncCallNode( - const string& xla_cluster_attr_name, const string& xla_cluster_name, - const string& outside_compilation_attr_name, - const string& func_call_node_name, const string& func_call_host_func_name, - const string& host_graph_func_name, FunctionLibraryDefinition* fld) { + const std::string& xla_cluster_attr_name, + const std::string& xla_cluster_name, + const std::string& outside_compilation_attr_name, + const std::string& func_call_node_name, + const std::string& func_call_host_func_name, + const std::string& host_graph_func_name, FunctionLibraryDefinition* fld) { Graph host_graph(fld); AttrValue device_ordinal_value; device_ordinal_value.set_placeholder("_device_ordinal"); @@ -1807,18 +1828,19 @@ absl::Status BuildHostGraphForFuncCallNode( } TF_ATTRIBUTE_NOINLINE absl::Status ExtractOutsideCompilationForFuncCallNode( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, const string& xla_cluster_name, - const std::map& host_compute_core, Graph* g, Node* n, + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, + const std::map& host_compute_core, Graph* g, Node* n, FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, - std::vector* host_graphs, - std::vector* shape_inference_graphs, + std::vector* host_graphs, + std::vector* shape_inference_graphs, bool* has_outside_compilation) { bool func_has_outside_compilation = false; NameAttrList func; if (fld->Contains(n->type_string())) { func.set_name(n->type_string()); - typedef protobuf::Map AttrMap; + typedef protobuf::Map AttrMap; *func.mutable_attr() = AttrMap(n->attrs().begin(), n->attrs().end()); } else if (n->IsPartitionedCall()) { TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "f", &func)); @@ -1827,7 +1849,7 @@ TF_ATTRIBUTE_NOINLINE absl::Status ExtractOutsideCompilationForFuncCallNode( func.set_name(FunctionLibraryDefinition::kGradientOp); 
*func.mutable_attr() = n->def().attr(); } - string canonical_func_name; + std::string canonical_func_name; if (func.name() == FunctionLibraryDefinition::kGradientOp) { NameAttrList forward_func; TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "f", &forward_func)); @@ -1835,8 +1857,8 @@ TF_ATTRIBUTE_NOINLINE absl::Status ExtractOutsideCompilationForFuncCallNode( } else { canonical_func_name = func.name(); } - string new_func_name = absl::StrCat(canonical_func_name, "_oc"); - string host_func_name = + std::string new_func_name = absl::StrCat(canonical_func_name, "_oc"); + std::string host_func_name = absl::StrCat("oc_func_call_host_", canonical_func_name); TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction( xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, @@ -1876,11 +1898,11 @@ TF_ATTRIBUTE_NOINLINE absl::Status ExtractOutsideCompilationForFuncCallNode( TF_RETURN_IF_ERROR(replace_builder->Finalize(replace_def.get())); TF_ASSIGN_OR_RETURN(Node * replace, ReplaceNode(g, n, *replace_def)); replace->AddAttr(kXlaTokenInputNodesAttrName, - std::vector{kXlaTokenArgNodeName}); + std::vector{kXlaTokenArgNodeName}); replace->AddAttr(kXlaOriginalOutsideCompilationNodeName, replace->name()); // Build host side graph for the function call. 
- string oc_host_graph_name = + std::string oc_host_graph_name = absl::StrCat("oc_func_host_graph_", replace->name()); TF_RETURN_IF_ERROR(BuildHostGraphForFuncCallNode( xla_cluster_attr_name, xla_cluster_name, outside_compilation_attr_name, @@ -1893,12 +1915,13 @@ TF_ATTRIBUTE_NOINLINE absl::Status ExtractOutsideCompilationForFuncCallNode( } absl::Status ExtractOutsideCompilationForIfNode( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, const string& xla_cluster_name, - const std::map& host_compute_core, Graph* g, Node* n, + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, + const std::map& host_compute_core, Graph* g, Node* n, FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, - std::vector* host_graphs, - std::vector* shape_inference_graphs, + std::vector* host_graphs, + std::vector* shape_inference_graphs, bool* has_outside_compilation) { // Instantiate "then_branch" and "else_branch". NameAttrList then_branch, else_branch; @@ -1908,12 +1931,14 @@ absl::Status ExtractOutsideCompilationForIfNode( // Extract outside compilation for then_branch and else_branch. 
bool then_branch_has_outside_compilation = false; bool else_branch_has_outside_compilation = false; - string then_branch_host_func_name = - absl::StrCat("oc_then_branch_host_if_", then_branch.name()), - else_branch_host_func_name = - absl::StrCat("oc_else_branch_host_if_", else_branch.name()); - string then_branch_xla_func_name = absl::StrCat(then_branch.name(), "_oc"), - else_branch_xla_func_name = absl::StrCat(else_branch.name(), "_oc"); + std::string then_branch_host_func_name = + absl::StrCat("oc_then_branch_host_if_", then_branch.name()), + else_branch_host_func_name = + absl::StrCat("oc_else_branch_host_if_", else_branch.name()); + std::string then_branch_xla_func_name = + absl::StrCat(then_branch.name(), "_oc"), + else_branch_xla_func_name = + absl::StrCat(else_branch.name(), "_oc"); TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction( xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, then_branch, then_branch_xla_func_name, then_branch_host_func_name, @@ -1946,7 +1971,7 @@ absl::Status ExtractOutsideCompilationForIfNode( } n->AddAttr(kXlaOriginalOutsideCompilationNodeName, n->name()); - string host_transfer_key = absl::StrCat("oc_if_pred_", n->name()); + std::string host_transfer_key = absl::StrCat("oc_if_pred_", n->name()); // XLA computation: add a SendToHost node to send cond predicate. Node* pred_node; @@ -1956,7 +1981,7 @@ absl::Status ExtractOutsideCompilationForIfNode( BuildSendIfPredNode(absl::StrCat("send_oc_if_pred_", n->name()), host_transfer_key, pred_node, g)); n->AddAttr(kXlaTokenInputNodesAttrName, - std::vector{send_pred_node->name()}); + std::vector{send_pred_node->name()}); // Add a control edge from `send_pred_node` to If node, so XlaCompiler will // visit If node after `send_pred_node`, thus the token output for @@ -1969,7 +1994,7 @@ absl::Status ExtractOutsideCompilationForIfNode( // we need to create a no-op host graph. 
if (!then_branch_has_outside_compilation) { std::unique_ptr then_branch_host_graph(new Graph(fld)); - std::vector then_branch_host_graphs; + std::vector then_branch_host_graphs; TF_RETURN_IF_ERROR(ConstructHostGraph( xla_cluster_name, outside_compilation_attr_name, then_branch_host_graphs, fld, &then_branch_host_graph)); @@ -1986,7 +2011,7 @@ absl::Status ExtractOutsideCompilationForIfNode( } if (!else_branch_has_outside_compilation) { std::unique_ptr else_branch_host_graph(new Graph(fld)); - std::vector else_branch_host_graphs; + std::vector else_branch_host_graphs; TF_RETURN_IF_ERROR(ConstructHostGraph( xla_cluster_name, outside_compilation_attr_name, else_branch_host_graphs, fld, &else_branch_host_graph)); @@ -2001,7 +2026,7 @@ absl::Status ExtractOutsideCompilationForIfNode( TF_RETURN_IF_ERROR(fld->AddFunctionDef(else_branch_host_fdef)); } } - string oc_host_graph_name = absl::StrCat("oc_if_host_graph_", n->name()); + std::string oc_host_graph_name = absl::StrCat("oc_if_host_graph_", n->name()); TF_RETURN_IF_ERROR(BuildHostGraphForIfNode( xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, n->name(), host_transfer_key, oc_host_graph_name, fld, @@ -2012,12 +2037,13 @@ absl::Status ExtractOutsideCompilationForIfNode( } absl::Status ExtractOutsideCompilationForWhileNode( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, const string& xla_cluster_name, - const std::map& host_compute_core, Graph* g, Node* n, + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, + const std::map& host_compute_core, Graph* g, Node* n, FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, - std::vector* host_graphs, - std::vector* shape_inference_graphs, + std::vector* host_graphs, + std::vector* shape_inference_graphs, bool* has_outside_compilation) { // Instantiate "cond" and "body". 
NameAttrList cond, body; @@ -2027,10 +2053,12 @@ absl::Status ExtractOutsideCompilationForWhileNode( // Extract outside compilation for cond and body. bool cond_has_outside_compilation = false; bool body_has_outside_compilation = false; - string cond_host_func_name = absl::StrCat("oc_cond_host_while_", cond.name()), - body_host_func_name = absl::StrCat("oc_body_host_while_", body.name()); - string cond_xla_func_name = absl::StrCat(cond.name(), "_oc"), - body_xla_func_name = absl::StrCat(body.name(), "_oc"); + std::string cond_host_func_name = + absl::StrCat("oc_cond_host_while_", cond.name()), + body_host_func_name = + absl::StrCat("oc_body_host_while_", body.name()); + std::string cond_xla_func_name = absl::StrCat(cond.name(), "_oc"), + body_xla_func_name = absl::StrCat(body.name(), "_oc"); TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction( xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, cond, cond_xla_func_name, cond_host_func_name, host_compute_core, flr, @@ -2060,19 +2088,19 @@ absl::Status ExtractOutsideCompilationForWhileNode( } n->AddAttr(kXlaOriginalOutsideCompilationNodeName, n->name()); - string host_transfer_key = absl::StrCat("oc_while_pred_", n->name()); + std::string host_transfer_key = absl::StrCat("oc_while_pred_", n->name()); // XLA computation: rewrite cond function to add a SendToHost node to send // loop predicate. TF_RETURN_IF_ERROR(AddSendLoopPredToLoopCond( cond_xla_func_name, host_transfer_key, &cond, fld, n)); n->AddAttr(kXlaTokenInputNodesAttrName, - std::vector{kXlaTokenArgNodeName}); + std::vector{kXlaTokenArgNodeName}); // Build host side graph for the "While" node. 
if (!cond_has_outside_compilation) { std::unique_ptr cond_host_graph(new Graph(fld)); - std::vector host_graphs; + std::vector host_graphs; TF_RETURN_IF_ERROR(ConstructHostGraph(xla_cluster_name, outside_compilation_attr_name, host_graphs, fld, &cond_host_graph)); @@ -2088,7 +2116,7 @@ absl::Status ExtractOutsideCompilationForWhileNode( } if (!body_has_outside_compilation) { std::unique_ptr body_host_graph(new Graph(fld)); - std::vector host_graphs; + std::vector host_graphs; TF_RETURN_IF_ERROR(ConstructHostGraph(xla_cluster_name, outside_compilation_attr_name, host_graphs, fld, &body_host_graph)); @@ -2102,7 +2130,8 @@ absl::Status ExtractOutsideCompilationForWhileNode( TF_RETURN_IF_ERROR(fld->AddFunctionDef(body_host_fdef)); } } - string oc_host_graph_name = absl::StrCat("oc_while_host_graph_", n->name()); + std::string oc_host_graph_name = + absl::StrCat("oc_while_host_graph_", n->name()); TF_RETURN_IF_ERROR(BuildHostGraphForWhileNode( xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, n->name(), host_transfer_key, oc_host_graph_name, fld, @@ -2113,11 +2142,13 @@ absl::Status ExtractOutsideCompilationForWhileNode( } absl::Status ExtractOutsideCompilationForNodesWithAssociatedFunctions( - Graph* g, const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, const string& xla_cluster_name, - const std::map& host_compute_core, FunctionLibraryRuntime* flr, - FunctionLibraryDefinition* fld, std::vector* host_graphs, - std::vector* shape_inference_graphs, + Graph* g, const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, + const std::map& host_compute_core, + FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, + std::vector* host_graphs, + std::vector* shape_inference_graphs, bool* has_outside_compilation) { std::vector if_nodes, while_nodes, func_call_nodes; for (Node* n : g->nodes()) { @@ -2155,7 +2186,7 @@ absl::Status 
ExtractOutsideCompilationForNodesWithAssociatedFunctions( } absl::Status CopyOutsideCompilationConstNodes( - Graph* g, const string& outside_compilation_attr_name) { + Graph* g, const std::string& outside_compilation_attr_name) { for (Node* n : g->op_nodes()) { if (!n->IsConstant() || !HasNodeAttr(n->def(), outside_compilation_attr_name)) { @@ -2205,8 +2236,8 @@ absl::Status RewriteOutsideCompilationSubgraphFn::operator()( const std::vector& arg_source_tensors, std::unique_ptr* graph, std::vector* input_permutation, std::vector* output_permutation, NodeDef* node_def) { - string old_name = node_def->op(); - string new_name = + std::string old_name = node_def->op(); + std::string new_name = absl::StrCat(xla_cluster_name_, "_", new_function_name_, "_", old_name); node_def->set_op(new_name); node_def->set_name(new_name); @@ -2290,14 +2321,14 @@ absl::Status RewriteOutsideCompilationSubgraphFn::operator()( AddNodeAttr("shape_inference_graph", shape_inference_graph, node_def); AddNodeAttr("shapes", *shapes, node_def); } else { - string shape_inference_func_name = + std::string shape_inference_func_name = absl::StrCat("_outside_compilation_shape_inference_", new_name); NameAttrList shape_inference_graph; shape_inference_graph.set_name(shape_inference_func_name); AddNodeAttr("shape_inference_graph", shape_inference_graph, node_def); AddNodeAttr("shapes", std::vector{}, node_def); } - AddNodeAttr("ancestors", std::vector{}, node_def); + AddNodeAttr("ancestors", std::vector{}, node_def); AddNodeAttr("Tinputs", recv_at_host_dtypes, node_def); AddNodeAttr("Toutputs", send_from_host_dtypes, node_def); AddNodeAttr("key", absl::StrCat("host_compute_channel_", new_name), node_def); @@ -2306,15 +2337,16 @@ absl::Status RewriteOutsideCompilationSubgraphFn::operator()( } absl::Status ExtractOutsideCompilationForFunction( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, const string& xla_cluster_name, - const NameAttrList& func_name_attrs, const 
string& new_func_name, - const string& host_graph_func_name, - const std::map& host_compute_core, FunctionLibraryRuntime* flr, - FunctionLibraryDefinition* fld, std::vector* shape_inference_graphs, + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, const NameAttrList& func_name_attrs, + const std::string& new_func_name, const std::string& host_graph_func_name, + const std::map& host_compute_core, + FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, + std::vector* shape_inference_graphs, bool* has_outside_compilation) { // Convert the function to graph. - const string& func_name = func_name_attrs.name(); + const std::string& func_name = func_name_attrs.name(); FunctionLibraryRuntime::Handle handle; TF_RETURN_IF_ERROR( flr->Instantiate(func_name, AttrSlice(&func_name_attrs.attr()), &handle)); @@ -2345,8 +2377,8 @@ absl::Status ExtractOutsideCompilationForFunction( } std::unique_ptr graph_out; - std::vector outside_compilation_host_graphs; - std::vector shape_inference_graphs_to_rewrite; + std::vector outside_compilation_host_graphs; + std::vector shape_inference_graphs_to_rewrite; if (*has_outside_compilation) { // Copy outside compilation Const nodes with non outside compilation users. TF_RETURN_IF_ERROR(CopyOutsideCompilationConstNodes( @@ -2404,7 +2436,7 @@ absl::Status ExtractOutsideCompilationForFunction( } } } - std::map host_compute_nodes; + std::map host_compute_nodes; for (Node* n : outside_compilation_nodes) { auto host_compute_node_or = ReplaceOutsideCompilationCallNode( graph_out.get(), n, host_compute_core, *cluster_deps); @@ -2416,11 +2448,11 @@ absl::Status ExtractOutsideCompilationForFunction( // them so XlaCompiler can handle them in correct order. 
for (const auto& iter : host_compute_nodes) { Node* host_compute_node = iter.second; - std::vector token_input_node_names; + std::vector token_input_node_names; TF_RETURN_IF_ERROR(GetNodeAttr(host_compute_node->def(), kXlaTokenInputNodesAttrName, &token_input_node_names)); - for (const string& node_name : token_input_node_names) { + for (const std::string& node_name : token_input_node_names) { if (node_name == kXlaTokenArgNodeName) { continue; } @@ -2459,7 +2491,7 @@ absl::Status ExtractOutsideCompilationForFunction( // Shape inference graphs might contain Placeholder nodes for outside // compilation to outside compilation edges. Rewrite shape inference graphs // to remove such nodes. - for (const string& shape_inference_graph : + for (const std::string& shape_inference_graph : shape_inference_graphs_to_rewrite) { TF_RETURN_IF_ERROR( RewriteShapeInferenceGraph(shape_inference_graph, host_graph.get(), @@ -2467,7 +2499,7 @@ absl::Status ExtractOutsideCompilationForFunction( } // Remove the outside compilation graphs from function library. 
- for (const string& func : outside_compilation_host_graphs) { + for (const std::string& func : outside_compilation_host_graphs) { TF_RETURN_IF_ERROR(fld->RemoveFunction(func)); } @@ -2499,9 +2531,9 @@ absl::Status ExtractOutsideCompilationForFunction( } absl::Status ExtractOutsideCompilation( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, - const std::unordered_map& clusters, Graph* g, + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::unordered_map& clusters, Graph* g, FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, bool* modified) { if (VLOG_IS_ON(4)) { @@ -2511,14 +2543,14 @@ absl::Status ExtractOutsideCompilation( *modified = false; auto node_name_index = g->BuildNodeNameIndex(); for (auto& iter : clusters) { - string xla_cluster_name = iter.first; + std::string xla_cluster_name = iter.first; Node* n = iter.second.node; auto const& func_name_attrs = iter.second.func_name_attrs; auto const& host_compute_core = iter.second.host_compute_core; - std::vector shape_inference_graphs; + std::vector shape_inference_graphs; bool has_outside_compilation; - string host_graph_func_name = + std::string host_graph_func_name = absl::StrCat("oc_host_graph_", xla_cluster_name); TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction( xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, @@ -2528,7 +2560,7 @@ absl::Status ExtractOutsideCompilation( *modified |= has_outside_compilation; if (has_outside_compilation) { - string pivot_name = absl::StrCat(xla_cluster_name, "/pivot"); + std::string pivot_name = absl::StrCat(xla_cluster_name, "/pivot"); Node* pivot_node = node_name_index[pivot_name]; TF_RETURN_IF_ERROR(ExpandHostGraphIntoMainGraph( g, fld, host_graph_func_name, n, pivot_node)); diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.h b/tensorflow/compiler/jit/extract_outside_compilation_pass.h index 
7631ccd0bc6ab0..c1697fcb4cde0d 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass.h +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.h @@ -44,9 +44,9 @@ namespace tensorflow { class RewriteOutsideCompilationSubgraphFn { public: RewriteOutsideCompilationSubgraphFn( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, - const string& xla_cluster_name, const string& new_function_name) + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, const std::string& new_function_name) : xla_cluster_attr_name_(xla_cluster_attr_name), outside_compilation_attr_name_(outside_compilation_attr_name), xla_cluster_name_(xla_cluster_name), @@ -59,10 +59,10 @@ class RewriteOutsideCompilationSubgraphFn { NodeDef* node_def); private: - string xla_cluster_attr_name_; - string outside_compilation_attr_name_; - string xla_cluster_name_; - string new_function_name_; + std::string xla_cluster_attr_name_; + std::string outside_compilation_attr_name_; + std::string xla_cluster_name_; + std::string new_function_name_; }; // For an XLA computation function, replace all outside compilations with @@ -88,12 +88,13 @@ class RewriteOutsideCompilationSubgraphFn { // has_outside_compilation: a bool indicating whether this function has any // outside compilation nodes. 
absl::Status ExtractOutsideCompilationForFunction( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, const string& xla_cluster_name, - const NameAttrList& func_name_attrs, const string& new_func_name, - const string& host_graph_func_name, - const std::map& host_compute_core, FunctionLibraryRuntime* flr, - FunctionLibraryDefinition* fld, std::vector* shape_inference_graphs, + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, const NameAttrList& func_name_attrs, + const std::string& new_func_name, const std::string& host_graph_func_name, + const std::map& host_compute_core, + FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, + std::vector* shape_inference_graphs, bool* has_outside_compilation); // Rewrites XLA computation in `clusters` to replace outside compilation nodes @@ -101,9 +102,9 @@ absl::Status ExtractOutsideCompilationForFunction( // of outside compilation outputs cannot be determined now, we will store shape // inference graph into `fld`. 
absl::Status ExtractOutsideCompilation( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, - const std::unordered_map& clusters, Graph* g, + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::unordered_map& clusters, Graph* g, FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, bool* modified); diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc index 4d007d07504939..1a6441a80726a0 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc @@ -236,14 +236,14 @@ class ExtractOutsideCompilationForFunctionTest : public ::testing::Test { } absl::Status ExtractOutsideCompilationTest( - const string &xla_cluster_attr_name, - const string &outside_compilation_attr_name, - const string &xla_cluster_name, const NameAttrList &func_name_attrs, - const string &new_func_name, const string &host_graph_func_name, - const std::map &host_compute_core, - FunctionLibraryDefinition *fld, - std::vector *shape_inference_graphs, - bool *has_outside_compilation) { + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, const NameAttrList& func_name_attrs, + const std::string& new_func_name, const std::string& host_graph_func_name, + const std::map& host_compute_core, + FunctionLibraryDefinition* fld, + std::vector* shape_inference_graphs, + bool* has_outside_compilation) { OptimizerOptions opts; pflr_ = std::make_unique( device_mgr_.get(), Env::Default(), /*config=*/nullptr, @@ -288,9 +288,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, Basic) { } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); - protobuf::Map attrs; - std::map host_compute_core = {{"0", 1}, {"1", 0}}; - std::vector shape_inference_graphs; + 
protobuf::Map attrs; + std::map host_compute_core = {{"0", 1}, {"1", 0}}; + std::vector shape_inference_graphs; bool has_outside_compilation; NameAttrList name_attrs; name_attrs.set_name("cluster"); @@ -342,7 +342,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, Basic) { std::unique_ptr host_fbody; AttrValue device_ordinal_temp_value; device_ordinal_temp_value.set_i(0); - protobuf::Map host_func_attrs; + protobuf::Map host_func_attrs; host_func_attrs["_device_ordinal"] = device_ordinal_temp_value; TF_CHECK_OK(FunctionDefToBodyHelper( *fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld, &host_fbody)); @@ -406,9 +406,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, NoHostGraph) { } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); - protobuf::Map attrs; - std::map host_compute_core = {{"0", 1}, {"1", 0}}; - std::vector shape_inference_graphs; + protobuf::Map attrs; + std::map host_compute_core = {{"0", 1}, {"1", 0}}; + std::vector shape_inference_graphs; bool has_outside_compilation; NameAttrList name_attrs; name_attrs.set_name("cluster"); @@ -481,9 +481,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) { } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); - protobuf::Map attrs; - std::map host_compute_core; - std::vector shape_inference_graphs; + protobuf::Map attrs; + std::map host_compute_core; + std::vector shape_inference_graphs; bool has_outside_compilation; NameAttrList name_attrs; name_attrs.set_name("cluster"); @@ -498,7 +498,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) { std::unique_ptr host_fbody; AttrValue device_ordinal_temp_value; device_ordinal_temp_value.set_i(0); - protobuf::Map host_func_attrs; + protobuf::Map host_func_attrs; host_func_attrs["_device_ordinal"] = device_ordinal_temp_value; TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld, @@ -568,7 +568,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, 
OutsideCompilationInIf) { // _xla_token_input_nodes. Node *if_node = node_name_index["if"]; EXPECT_NE(if_node, nullptr); - std::vector token_inputs; + std::vector token_inputs; TF_CHECK_OK( GetNodeAttr(if_node->def(), "_xla_token_input_nodes", &token_inputs)); EXPECT_THAT(token_inputs, ::testing::ElementsAre("send_oc_if_pred_if")); @@ -631,9 +631,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) { } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); - protobuf::Map attrs; - std::map host_compute_core; - std::vector shape_inference_graphs; + protobuf::Map attrs; + std::map host_compute_core; + std::vector shape_inference_graphs; bool has_outside_compilation; NameAttrList name_attrs; name_attrs.set_name("cluster"); @@ -648,7 +648,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) { std::unique_ptr host_fbody; AttrValue device_ordinal_temp_value; device_ordinal_temp_value.set_i(0); - protobuf::Map host_func_attrs; + protobuf::Map host_func_attrs; host_func_attrs["_device_ordinal"] = device_ordinal_temp_value; TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld, @@ -767,9 +767,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) { TF_CHECK_OK(fld.AddFunctionDef(*xla_fdef)); } - protobuf::Map attrs; - std::map host_compute_core; - std::vector shape_inference_graphs; + protobuf::Map attrs; + std::map host_compute_core; + std::vector shape_inference_graphs; bool has_outside_compilation; NameAttrList name_attrs; name_attrs.set_name("cluster"); @@ -784,7 +784,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) { std::unique_ptr host_fbody; AttrValue device_ordinal_temp_value; device_ordinal_temp_value.set_i(0); - protobuf::Map host_func_attrs; + protobuf::Map host_func_attrs; host_func_attrs["_device_ordinal"] = device_ordinal_temp_value; TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("host_graph"), 
AttrSlice(&host_func_attrs), &fld, @@ -873,9 +873,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); - protobuf::Map attrs; - std::map host_compute_core = {{"0", 1}, {"1", 0}}; - std::vector shape_inference_graphs; + protobuf::Map attrs; + std::map host_compute_core = {{"0", 1}, {"1", 0}}; + std::vector shape_inference_graphs; bool has_outside_compilation; NameAttrList name_attrs; name_attrs.set_name("cluster"); @@ -898,14 +898,15 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, EXPECT_NE(host_compute_1, nullptr); // Check XlaHostCompute nodes' "_xla_token_input_nodes" attr. - std::vector token_input_nodes; + std::vector token_input_nodes; TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_0->attrs()), "_xla_token_input_nodes", &token_input_nodes)); - std::vector expected_token_input_nodes_0({"_xla_token_arg_node"}); + std::vector expected_token_input_nodes_0( + {"_xla_token_arg_node"}); EXPECT_EQ(token_input_nodes, expected_token_input_nodes_0); token_input_nodes.clear(); - std::vector expected_token_input_nodes_1( + std::vector expected_token_input_nodes_1( {"_xla_token_arg_node", "outside_compilation_0_host_compute"}); TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_1->attrs()), "_xla_token_input_nodes", &token_input_nodes)); @@ -955,9 +956,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); - protobuf::Map attrs; - std::map host_compute_core = {{"0", 1}, {"1", 0}}; - std::vector shape_inference_graphs; + protobuf::Map attrs; + std::map host_compute_core = {{"0", 1}, {"1", 0}}; + std::vector shape_inference_graphs; bool has_outside_compilation; NameAttrList name_attrs; name_attrs.set_name("cluster"); @@ -980,14 +981,15 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, EXPECT_NE(host_compute_1, nullptr); // Check XlaHostCompute nodes' "_xla_token_input_nodes" attr. 
- std::vector token_input_nodes; + std::vector token_input_nodes; TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_0->attrs()), "_xla_token_input_nodes", &token_input_nodes)); - std::vector expected_token_input_nodes_0({"_xla_token_arg_node"}); + std::vector expected_token_input_nodes_0( + {"_xla_token_arg_node"}); EXPECT_EQ(token_input_nodes, expected_token_input_nodes_0); token_input_nodes.clear(); - std::vector expected_token_input_nodes_1( + std::vector expected_token_input_nodes_1( {"_xla_token_arg_node", "outside_compilation_0_host_compute"}); TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_1->attrs()), "_xla_token_input_nodes", &token_input_nodes)); diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc index e7a375231accdf..a0a0d45736f1e8 100644 --- a/tensorflow/compiler/jit/flags.cc +++ b/tensorflow/compiler/jit/flags.cc @@ -46,7 +46,7 @@ std::vector* jitrt_flag_list; std::vector* flag_list; absl::once_flag flags_init; -bool SetterForXlaAutoJitFlag(const string& value) { +bool SetterForXlaAutoJitFlag(const std::string& value) { int32_t opt_level; // We need to use the mark_for_compilation_flags directly here instead of // going via GetMarkForCompilationPassFlags() to avoid infinite recursion. 
The @@ -81,7 +81,7 @@ bool SetterForXlaAutoJitFlag(const string& value) { return true; } -bool SetterForXlaCallModuleDisabledChecks(const string& value) { +bool SetterForXlaCallModuleDisabledChecks(const std::string& value) { auto directives = absl::StrSplit(value, ',', absl::SkipEmpty()); call_module_flags->disabled_checks.insert(directives.begin(), directives.end()); @@ -231,7 +231,7 @@ void AllocateAndParseFlags() { mark_for_compilation_flags->xla_auto_jit_flag.optimization_level_general = 0; mark_for_compilation_flags->tf_xla_min_cluster_size = 4; mark_for_compilation_flags->tf_xla_max_cluster_size = - std::numeric_limits::max(); + std::numeric_limits::max(); mark_for_compilation_flags->tf_xla_clustering_debug = false; mark_for_compilation_flags->tf_xla_cpu_global_jit = false; mark_for_compilation_flags->tf_xla_clustering_fuel = @@ -291,6 +291,7 @@ void AllocateAndParseFlags() { // Dump graphs in TFG dialect. bool use_tfg_graph_dumper = false; bool enable_tpu_variable_runtime_reformatting_pass = true; + bool enable_serialize_mlir_to_compressed_bytecode = false; flag_list = new std::vector( {Flag("tf_xla_enable_lazy_compilation", @@ -405,7 +406,10 @@ void AllocateAndParseFlags() { &enable_tpu_variable_runtime_reformatting_pass, "Enables TPUVariableRuntimeReformatting pass for MLIR-Based " "TensorFlow Compiler Bridge. 
This enables weight update sharding " - "and creates TPUReshardVariables ops.")}); + "and creates TPUReshardVariables ops."), + Flag("tf_serialize_mlir_to_compressed_bytecode", + &enable_serialize_mlir_to_compressed_bytecode, + "If true, serialize MLIR to compressed bytecode.")}); AppendMarkForCompilationPassFlagsInternal(flag_list); xla::ParseFlagsFromEnvAndDieIfUnknown("TF_XLA_FLAGS", *flag_list); @@ -434,6 +438,8 @@ void AllocateAndParseFlags() { enable_mlir_multiple_local_cpu_devices; mlir_flags->tf_mlir_enable_debug_info_serialization = enable_mlir_debug_info_serialization; + mlir_flags->tf_serialize_mlir_to_compressed_bytecode = + enable_serialize_mlir_to_compressed_bytecode; if (use_tfg_graph_dumper) { UseMlirForGraphDump(MlirDumpConfig{}.elide_large_attributes().emit_dialect( @@ -457,7 +463,7 @@ void ResetFlags() { } // namespace -bool SetXlaAutoJitFlagFromFlagString(const string& value) { +bool SetXlaAutoJitFlagFromFlagString(const std::string& value) { absl::call_once(flags_init, &AllocateAndParseFlags); return SetterForXlaAutoJitFlag(value); } diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h index b355c79364cb1b..96154b892ae5b0 100644 --- a/tensorflow/compiler/jit/flags.h +++ b/tensorflow/compiler/jit/flags.h @@ -41,15 +41,15 @@ struct XlaAutoJitFlag { // `optimization_level_general` applies. // // Experimental. - int32 optimization_level_single_gpu; - int32 optimization_level_general; + int32_t optimization_level_single_gpu; + int32_t optimization_level_general; }; // Sets the xla_auto_jit_flag based on the given flag string. Supported syntax // is: // : sets general and single_gpu setting to the provided number. // single-gpu(): sets the single_gpu setting to the provided number. -bool SetXlaAutoJitFlagFromFlagString(const string& value); +bool SetXlaAutoJitFlagFromFlagString(const std::string& value); // Flags associated with the XLA bridge's mark_for_compilation_pass module. 
struct MarkForCompilationPassFlags { @@ -57,16 +57,16 @@ struct MarkForCompilationPassFlags { // Minimum number of operators in an XLA compilation. Ignored for operators // placed on an XLA device or operators explicitly marked for compilation. - int32 tf_xla_min_cluster_size; + int32_t tf_xla_min_cluster_size; // Maximum number of operators in an XLA compilation. - int32 tf_xla_max_cluster_size; + int32_t tf_xla_max_cluster_size; // If non-empty, limit XLA clustering to the following TF operations. - string tf_xla_ops_to_cluster; + std::string tf_xla_ops_to_cluster; // If non-empty, remove following operations from XLA clustering excludelist. - string tf_xla_cluster_exclude_ops; + std::string tf_xla_cluster_exclude_ops; // Dump graphs during XLA compilation. bool tf_xla_clustering_debug; @@ -110,7 +110,7 @@ struct MarkForCompilationPassFlags { bool tf_xla_disable_strict_signature_checks; // Specifies the persistance cache prefix. Default is "xla_compile_cache" - string tf_xla_persistent_cache_prefix; + std::string tf_xla_persistent_cache_prefix; }; // Flags associated with XLA Sparse Core. @@ -299,6 +299,7 @@ struct MlirCommonFlags { // with different local CPU devices settings. bool tf_mlir_enable_multiple_local_cpu_devices; bool tf_mlir_enable_debug_info_serialization; + bool tf_serialize_mlir_to_compressed_bytecode; }; // Flags for the JitRt pipeline -- see tf_jitrt_pipeline.h for details. 
diff --git a/tensorflow/compiler/jit/force_xla_constants_on_host_pass_test.cc b/tensorflow/compiler/jit/force_xla_constants_on_host_pass_test.cc index 75bd1d7310a295..1b0239c3550970 100644 --- a/tensorflow/compiler/jit/force_xla_constants_on_host_pass_test.cc +++ b/tensorflow/compiler/jit/force_xla_constants_on_host_pass_test.cc @@ -95,7 +95,7 @@ TEST(ForceXlaConstantsOnHostPassTest, Simple) { if (CanCreateXlaKernel(node->def())) { EXPECT_FALSE(found); found = true; - std::vector hostmem_attr; + std::vector hostmem_attr; EXPECT_TRUE(TryGetNodeAttr(node->def(), "_input_hostmem", &hostmem_attr)); EXPECT_EQ(hostmem_attr.size(), 1); EXPECT_EQ(hostmem_attr[0], 1); diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc index 8317d222928200..03a7d1081b8b53 100644 --- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc +++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc @@ -93,7 +93,7 @@ std::vector IntTensorAsVector(const Tensor& t) { result.reserve(t.NumElements()); for (int i = 0; i < t.NumElements(); i++) { int64_t element = t.dtype() == DT_INT32 - ? static_cast(t.flat()(i)) + ? 
static_cast(t.flat()(i)) : t.flat()(i); result.push_back(element); } @@ -251,14 +251,14 @@ absl::Status ComputeSliceSize(const Scope& host_scope, absl::Status ConvertTensorFlowSliceToStaticShapedSlice( Graph* g, Node* slice, const SliceInputs& slice_inputs, absl::string_view cluster_name, Node** result) { - string host_name; + std::string host_name; TF_RETURN_IF_ERROR(DeviceNameUtils::DeviceNameToCpuDeviceName( slice->assigned_device_name(), &host_name)); absl::Status status; Scope main_scope = NewInternalScope(g, &status, /*refiner=*/nullptr) - .WithXlaCluster(string(cluster_name)) + .WithXlaCluster(std::string(cluster_name)) .NewSubScope(absl::StrCat(slice->name(), "/static_shaped_slice")); Scope host_scope = main_scope.WithAssignedDevice(host_name); @@ -286,7 +286,7 @@ absl::Status ConvertTensorFlowSliceToStaticShapedSlice( TF_RETURN_IF_ERROR(main_scope.status()); - std::vector compile_time_const_inputs; + std::vector compile_time_const_inputs; compile_time_const_inputs.push_back("size"); (*result)->AddAttr(kXlaCompileTimeConstantInputsAttr, compile_time_const_inputs); diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc index 411f761995483a..6a8523a7d4c893 100644 --- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc +++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc @@ -66,7 +66,8 @@ class FakeDevice : public Device { Allocator* GetAllocator(AllocatorAttributes attr) override { return nullptr; } - static std::unique_ptr Make(const string& name, const string& type) { + static std::unique_ptr Make(const std::string& name, + const std::string& type) { DeviceAttributes device_attributes; device_attributes.set_name(name); device_attributes.set_device_type(DeviceType(type).type()); @@ -100,7 +101,7 @@ absl::Status IncreaseDynamismForAutoJit(const Scope& s, // Scope::ToGraph seems to drop assigned devices, probably because it 
goes // through a GraphDef. So explicitly maintain the device assignment. - std::unordered_map assigned_device_names; + std::unordered_map assigned_device_names; for (Node* n : s.graph()->nodes()) { assigned_device_names[n->name()] = n->assigned_device_name(); } @@ -149,7 +150,7 @@ TEST(SliceToDynamicSliceRewriteTest, Basic) { Inputs(m_slice_size_0, Const(static_cast(500)), Const(zero_32)))); - std::vector compile_time_constant_inputs; + std::vector compile_time_constant_inputs; compile_time_constant_inputs.push_back("size"); auto m_dynamic_slice = NodeWith( Op("Slice"), AssignedDevice(kDeviceName), diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index c3a24f3e0f7163..340cdbe8032c63 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -151,7 +151,7 @@ class MarkForCompilationPassImpl { std::optional resource_op_device, std::optional resource_var_operation_node_id, std::optional deadness_predicate, - bool is_xla_compile_attr_true, std::optional xla_scope) + bool is_xla_compile_attr_true, std::optional xla_scope) : cycles_graph_node_id_(tf_graph_node_id), effective_cluster_size_(effective_cluster_size), has_functional_control_flow_(has_functional_control_flow), @@ -220,7 +220,7 @@ class MarkForCompilationPassImpl { // If not nullopt then the all nodes in the cluster either do not have the // XlaScope attribute set or have it set to the value returned. - const std::optional& xla_scope() const { return xla_scope_; } + const std::optional& xla_scope() const { return xla_scope_; } // Returns the TF graph node IDs for the resource variable operations in // this cluster. 
@@ -228,7 +228,7 @@ class MarkForCompilationPassImpl { return resource_var_operation_node_ids_; } - string DebugString(const Graph& graph) const { + std::string DebugString(const Graph& graph) const { Node* node = graph.FindNodeId(cycles_graph_node_id()); if (!node) { // This should never happen but we try to be resilient because this is a @@ -254,7 +254,7 @@ class MarkForCompilationPassImpl { std::optional resource_op_device_; std::optional deadness_predicate_; bool is_xla_compile_attr_true_; - std::optional xla_scope_; + std::optional xla_scope_; std::vector resource_var_operation_node_ids_; Cluster(const Cluster&) = delete; @@ -365,7 +365,7 @@ class MarkForCompilationPassImpl { std::optional resource_var_operation_node_id, std::optional deadness_predicate, bool is_xla_compile_attr_true, - std::optional xla_scope) { + std::optional xla_scope) { cluster_storage_.push_back(std::make_unique( cycles_graph_node_id, effective_cluster_size, has_functional_control_flow, device_set, resource_op_device, @@ -374,7 +374,7 @@ class MarkForCompilationPassImpl { return cluster_storage_.back().get(); } - std::optional GetXlaScope(Node* n); + std::optional GetXlaScope(Node* n); // Returns the cluster for node `n`. If two nodes, N1 and N2, are placed in // the same cluster by the clustering algorithm then this function will return @@ -417,7 +417,8 @@ class MarkForCompilationPassImpl { // Returns a string representing `cycles_graph_node_id`. If the node is // unclusterable (either it is a phatom "frame" node or is not a compilation // candidate) then set `*found_unclustered` to true. - string DebugStringForCyclesGraphNode(int node_id, bool* found_unclustered); + std::string DebugStringForCyclesGraphNode(int node_id, + bool* found_unclustered); // We could not contract the edge from `from` to `to`. 
Return a string // describing an alternate path from `from` to `to` (besides the direct edge @@ -429,7 +430,7 @@ class MarkForCompilationPassImpl { // contracted because of the path [P,Q,R]" where P, Q and R are all clusters // since in that case a natural question is why we could not form a {A, P, Q, // R, B} cluster. - string DescribePotentialCycle(int from, int to); + std::string DescribePotentialCycle(int from, int to); // Merge the clusters `cluster_from` and `cluster_to`. After this step the // larger combined cluster is represented by `cluster_from`, but can have @@ -459,8 +460,8 @@ class MarkForCompilationPassImpl { return true; } - string EdgeContractionFailureMsg(Cluster* from, Cluster* to, - absl::string_view reason) { + std::string EdgeContractionFailureMsg(Cluster* from, Cluster* to, + absl::string_view reason) { return absl::StrCat("Could not contract ", from->DebugString(*graph_), " -> ", to->DebugString(*graph_), " because ", reason, "."); @@ -468,7 +469,7 @@ class MarkForCompilationPassImpl { DebugOptions debug_options_; Graph* graph_; - uint64 graph_fingerprint_; + uint64_t graph_fingerprint_; FunctionLibraryDefinition* flib_def_; Env* env_; OptimizerOptions::GlobalJitLevel global_jit_level_; @@ -547,7 +548,7 @@ std::vector MarkForCompilationPassImpl::FindAlternatePathForDebugging( return path; } -string MarkForCompilationPassImpl::DebugStringForCyclesGraphNode( +std::string MarkForCompilationPassImpl::DebugStringForCyclesGraphNode( int cycles_graph_node_id, bool* found_unclustered) { Cluster* cluster = GetClusterForCyclesGraphNode(cycles_graph_node_id); if (cluster) { @@ -567,8 +568,9 @@ string MarkForCompilationPassImpl::DebugStringForCyclesGraphNode( return node->name(); } -string MarkForCompilationPassImpl::DescribePotentialCycle(int from, int to) { - std::vector path_str; +std::string MarkForCompilationPassImpl::DescribePotentialCycle(int from, + int to) { + std::vector path_str; bool found_unclustered = false; 
absl::c_transform(FindAlternatePathForDebugging(from, to), std::back_inserter(path_str), [&](int node_id) { @@ -701,7 +703,7 @@ absl::StatusOr MarkForCompilationPassImpl::ForEachEdgeInPostOrder( // Make a copy of the set of successors because we may modify the graph in // TryToContractEdge. - std::vector successors_copy = + std::vector successors_copy = cycles_graph_.SuccessorsCopy(cluster_from->cycles_graph_node_id()); for (int to : successors_copy) { @@ -974,7 +976,7 @@ class ClusterSequenceNumberGenerator { sequence_numbers_.clear(); } - int64 GetNext(uint64 key) { + int64_t GetNext(uint64_t key) { mutex_lock lock(mu_); return sequence_numbers_[key]++; } @@ -987,13 +989,13 @@ class ClusterSequenceNumberGenerator { private: mutex mu_; - absl::flat_hash_map sequence_numbers_; + absl::flat_hash_map sequence_numbers_; }; // Get a monotonic sequence numbers for a graph identified by its `fingerprint`. // The sequence number is necessary to disambiguate clusters extracted from the // same graph and when duplicate graphs exist within the same process. -int64_t GetNextClusterSequenceNumber(uint64 fingerprint) { +int64_t GetNextClusterSequenceNumber(uint64_t fingerprint) { return ClusterSequenceNumberGenerator::Global().GetNext(fingerprint); } @@ -1002,7 +1004,7 @@ absl::Status MarkForCompilationPassImpl::CreateClusters() { clusters_created_ = true; // Names for each cluster. 
- std::unordered_map cluster_names; + std::unordered_map cluster_names; if (debug_options_.dump_graphs) { DumpGraphToFile("before_mark_for_compilation", *graph_, flib_def_); @@ -1030,7 +1032,7 @@ absl::Status MarkForCompilationPassImpl::CreateClusters() { if (cluster->effective_cluster_size() >= debug_options_.min_cluster_size || cluster->has_functional_control_flow() || cluster->is_xla_compile_attr_true()) { - string& name = cluster_names[cluster->cycles_graph_node_id()]; + std::string& name = cluster_names[cluster->cycles_graph_node_id()]; if (name.empty()) { if (!cluster_name_prefix_.empty()) { @@ -1099,7 +1101,7 @@ MarkForCompilationPassImpl::ClusteringWillIntroduceInterDeviceDependency( return false; } -std::optional MarkForCompilationPassImpl::GetXlaScope(Node* node) { +std::optional MarkForCompilationPassImpl::GetXlaScope(Node* node) { // Look for either _XlaScope or _XlaInternalScope on both nodes to guide // clustering. If both nodes have a scope and the scopes do not match, do // not cluster along this edge. If even one of the nodes lacks a scope @@ -1118,14 +1120,14 @@ std::optional MarkForCompilationPassImpl::GetXlaScope(Node* node) { if (global_jit_level_ != OptimizerOptions::OFF) { // If global_jit_level_ is ON, respect only _XlaInternalScope. - const string& scope = + const std::string& scope = GetNodeAttrString(node->attrs(), kXlaInternalScopeAttr); if (!scope.empty()) { return scope; } } else { // If global_jit_level_ is OFF, respect only _XlaScope. - const string& scope = GetNodeAttrString(node->attrs(), kXlaScopeAttr); + const std::string& scope = GetNodeAttrString(node->attrs(), kXlaScopeAttr); if (!scope.empty()) { return scope; } @@ -1186,9 +1188,9 @@ absl::Status MarkForCompilationPassImpl::BuildInitialClusterSet() { deadness_analysis_->GetPredicateFor(node, Graph::kControlSlot)); } - const string& device_name_str = !node->assigned_device_name().empty() - ? 
node->assigned_device_name() - : node->requested_device(); + const std::string& device_name_str = !node->assigned_device_name().empty() + ? node->assigned_device_name() + : node->requested_device(); TF_ASSIGN_OR_RETURN(DeviceId device, device_info_cache_.GetIdFor(device_name_str)); @@ -1258,16 +1260,17 @@ absl::StatusOr IsIdentityDrivingConstsInLoop(Node* node) { return true; } -absl::flat_hash_set CreateClusterExcludeList() { +absl::flat_hash_set CreateClusterExcludeList() { MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags(); - absl::flat_hash_set excludelist; + absl::flat_hash_set excludelist; for (auto s : absl::StrSplit(flags->tf_xla_cluster_exclude_ops, ',')) { if (!s.empty()) { - excludelist.insert(string(s)); + excludelist.insert(std::string(s)); } } if (VLOG_IS_ON(2) && !excludelist.empty()) { - std::vector vexcludelist(excludelist.begin(), excludelist.end()); + std::vector vexcludelist(excludelist.begin(), + excludelist.end()); absl::c_sort(vexcludelist); VLOG(2) << "XLA clustering will exclude following TF operations from auto " "clustering: " @@ -1276,11 +1279,11 @@ absl::flat_hash_set CreateClusterExcludeList() { return excludelist; } -absl::flat_hash_set GetOrCreateAllowlist() { - absl::flat_hash_map>* allowlist_table = +absl::flat_hash_set GetOrCreateAllowlist() { + absl::flat_hash_map>* allowlist_table = tensorflow::GetAllowlistTable(); MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags(); - absl::flat_hash_set allowlist; + absl::flat_hash_set allowlist; for (auto s : absl::StrSplit(flags->tf_xla_ops_to_cluster, ',')) { if (s == "FUSIBLE") { @@ -1292,12 +1295,12 @@ absl::flat_hash_set GetOrCreateAllowlist() { allowlist.insert(v.begin(), v.end()); } else if (!s.empty()) { // Should be a user provided TF operation. 
- allowlist.insert(string(s)); + allowlist.insert(std::string(s)); } } if (VLOG_IS_ON(2) && !allowlist.empty()) { - std::vector vallowlist(allowlist.begin(), allowlist.end()); + std::vector vallowlist(allowlist.begin(), allowlist.end()); absl::c_sort(vallowlist); VLOG(2) << "XLA clustering will only consider the following TF operations: " << absl::StrJoin(vallowlist, " "); @@ -1338,8 +1341,8 @@ absl::Status MarkForCompilationPassImpl::FindCompilationCandidates() { auto allowlist = GetOrCreateAllowlist(); - std::vector vall_ops = XlaOpRegistry::GetAllRegisteredOps(); - absl::flat_hash_set all_ops(vall_ops.begin(), vall_ops.end()); + std::vector vall_ops = XlaOpRegistry::GetAllRegisteredOps(); + absl::flat_hash_set all_ops(vall_ops.begin(), vall_ops.end()); // Check that user's provided TF operation really exists. for (const auto& s : allowlist) { if (!all_ops.contains(s)) { @@ -1674,7 +1677,7 @@ void MarkForCompilationPassImpl::DumpPostClusteringGraphs() { DumpGraphToFile("mark_for_compilation_annotated", new_graph, flib_def_); } -string RatioToString(int numerator, int denominator) { +std::string RatioToString(int numerator, int denominator) { return absl::StrFormat("%d / %d (%.2f%%)", numerator, denominator, (100.0 * numerator) / denominator); } @@ -1985,10 +1988,11 @@ absl::Status MarkForCompilationPass::RunForTest( return MarkForCompilation(options, debug_options); } -absl::flat_hash_map>* GetAllowlistTable() { +absl::flat_hash_map>* +GetAllowlistTable() { // Table format: category name: {list of TF operations in that category} - static absl::flat_hash_map>* result = - new absl::flat_hash_map>{ + static absl::flat_hash_map>* result = + new absl::flat_hash_map>{ // Unary {"PW", {"ComplexAbs", "Angle", "Conj", "Abs", "Acos", "Acosh", "Asin", @@ -2056,8 +2060,8 @@ void ResetClusterSequenceNumber() { ClusterSequenceNumberGenerator::Global().Reset(); } -absl::flat_hash_set GetKnownXLAAllowlistOp() { - absl::flat_hash_set result{ +absl::flat_hash_set 
GetKnownXLAAllowlistOp() { + absl::flat_hash_set result{ "AdjustContrastv2", "AdjustHue", "AdjustSaturation", diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.h b/tensorflow/compiler/jit/mark_for_compilation_pass.h index 558912f2eee2e0..d6a2814ed33982 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.h +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.h @@ -47,7 +47,7 @@ class MarkForCompilationPass : public GraphOptimizationPass { friend class MarkForCompilationPassTestHelper; }; -absl::flat_hash_map>* GetAllowlistTable(); +absl::flat_hash_map>* GetAllowlistTable(); namespace testing { // DO NOT USE IN PRODUCTION. @@ -56,7 +56,7 @@ namespace testing { void ResetClusterSequenceNumber(); // Return a list of operation that we choose not to put into the allowlist. -absl::flat_hash_set GetKnownXLAAllowlistOp(); +absl::flat_hash_set GetKnownXLAAllowlistOp(); } // namespace testing } // namespace tensorflow diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc index 1a120791206369..1d4031a4ffc926 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc @@ -67,10 +67,10 @@ static bool Initialized = [] { REGISTER_OP("UncompilableNullary").Output("o: float"); REGISTER_OP("UncompilableUnary").Input("a: float").Output("o: float"); -std::unordered_map GetClusters(const Graph& graph) { - std::unordered_map ids; +std::unordered_map GetClusters(const Graph& graph) { + std::unordered_map ids; for (Node* node : graph.nodes()) { - string cluster; + std::string cluster; if (TryGetNodeAttr(node->attrs(), kXlaClusterAttr, &cluster)) { CHECK(!cluster.empty()); ids[node->name()] = cluster; @@ -86,10 +86,10 @@ std::unordered_map GetClusters(const Graph& graph) { return ids; } -std::set GetClusterNames(const Graph& graph) { - std::set names; +std::set GetClusterNames(const Graph& graph) { + std::set 
names; for (Node* node : graph.nodes()) { - string cluster; + std::string cluster; if (TryGetNodeAttr(node->attrs(), kXlaClusterAttr, &cluster)) { CHECK(!cluster.empty()); names.insert(cluster); @@ -98,10 +98,10 @@ std::set GetClusterNames(const Graph& graph) { return names; } -absl::flat_hash_map> GetClusterSets( - const Graph& g, std::vector* cluster_names = nullptr) { +absl::flat_hash_map> GetClusterSets( + const Graph& g, std::vector* cluster_names = nullptr) { CHECK(cluster_names == nullptr || cluster_names->empty()); - absl::flat_hash_map> cluster_sets; + absl::flat_hash_map> cluster_sets; for (const auto& p : GetClusters(g)) { cluster_sets[p.second].push_back(p.first); } @@ -357,7 +357,7 @@ TEST(XlaCompilationTest, CallXlaDeviceFuncWithResourceOp) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - string xla_cpu_device = "/job:worker/replica:0/task:0/device:XLA_CPU:0"; + std::string xla_cpu_device = "/job:worker/replica:0/task:0/device:XLA_CPU:0"; testing::FindNodeByName(graph.get(), "A") ->set_assigned_device_name(xla_cpu_device); testing::FindNodeByName(graph.get(), "tanh0") @@ -694,7 +694,7 @@ TEST(XlaCompilationTest, ClusterNodesWithMismatchingInputDeadness) { } namespace { -Node* MakeRead(const Scope& scope, const string& id, +Node* MakeRead(const Scope& scope, const std::string& id, Node** var_handle_op = nullptr) { Output var_handle = ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({})); @@ -706,7 +706,7 @@ Node* MakeRead(const Scope& scope, const string& id, return read.node(); } -Node* MakeWrite(const Scope& scope, const string& id) { +Node* MakeWrite(const Scope& scope, const std::string& id) { Output var_handle = ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({})); Output value_to_write = @@ -716,7 +716,7 @@ Node* MakeWrite(const Scope& scope, const string& id) { return assign_op.operation.node(); } -Node* MakeNeutral(const Scope& scope, const string& id) { +Node* MakeNeutral(const Scope& 
scope, const std::string& id) { return ops::Const(scope.WithOpName("Const" + id), 42.0f).node(); } } // namespace @@ -733,11 +733,11 @@ TEST(XlaCompilationTest, ResourcesClusteringAllowed) { std::unique_ptr graph(new Graph(OpRegistry::Global())); TF_EXPECT_OK(root.ToGraph(graph.get())); TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - absl::flat_hash_map> cluster_sets = + absl::flat_hash_map> cluster_sets = GetClusterSets(*graph); ASSERT_EQ(cluster_sets.size(), 1); - std::vector expected_clustered_nodes = {"AssignmentW", "ReadR", - "ValueToAssignW"}; + std::vector expected_clustered_nodes = {"AssignmentW", "ReadR", + "ValueToAssignW"}; ASSERT_EQ(cluster_sets.begin()->second, expected_clustered_nodes); } @@ -753,7 +753,7 @@ TEST(XlaCompilationTest, ResourcesClusteringDisallowed) { std::unique_ptr graph(new Graph(OpRegistry::Global())); TF_EXPECT_OK(root.ToGraph(graph.get())); TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - absl::flat_hash_map> cluster_sets = + absl::flat_hash_map> cluster_sets = GetClusterSets(*graph); ASSERT_EQ(cluster_sets.size(), 0); } @@ -779,13 +779,13 @@ TEST(XlaCompilationTest, ChainOfOps) { TF_EXPECT_OK(root.ToGraph(graph.get())); TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::vector cluster_names; - absl::flat_hash_map> cluster_sets = + std::vector cluster_names; + absl::flat_hash_map> cluster_sets = GetClusterSets(*graph, &cluster_names); ASSERT_EQ(cluster_sets.size(), 1); - std::vector expected_clustered_nodes_a = { + std::vector expected_clustered_nodes_a = { "AssignmentW1", "ConstN0", "ReadR0", "ValueToAssignW1"}; ASSERT_EQ(cluster_sets[cluster_names[0]], expected_clustered_nodes_a); } @@ -881,7 +881,7 @@ TEST(XlaCompilationTest, ConstOp) { { std::unique_ptr graph(new Graph(OpRegistry::Global())); Scope root = Scope::NewRootScope().ExitOnError(); - auto c = ops::Const(root.WithOpName("const"), string("string")); + auto c = 
ops::Const(root.WithOpName("const"), std::string("string")); c.node()->AddAttr(kXlaCompileAttr, true); TF_ASSERT_OK(root.ToGraph(graph.get())); TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); @@ -901,12 +901,12 @@ TEST(XlaCompilationTest, DontClusterIdentityWithRefInput) { TF_ASSERT_OK(root.ToGraph(graph.get())); TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); ASSERT_FALSE(clusters.empty()); - string cluster_name = clusters.begin()->second; + std::string cluster_name = clusters.begin()->second; - std::unordered_map expected_clusters( + std::unordered_map expected_clusters( {{"negate", cluster_name}, {"add", cluster_name}}); EXPECT_EQ(clusters, expected_clusters); } @@ -924,12 +924,12 @@ TEST(XlaCompilationTest, ClusterIdentityWithNonRefInput) { TF_ASSERT_OK(root.ToGraph(graph.get())); TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); ASSERT_FALSE(clusters.empty()); - string cluster_name = clusters.begin()->second; + std::string cluster_name = clusters.begin()->second; - std::unordered_map expected_clusters( + std::unordered_map expected_clusters( {{"negate", cluster_name}, {"identity", cluster_name}, {"add", cluster_name}}); @@ -956,7 +956,7 @@ TEST(XlaCompilationTest, ClusterControlTrigger) { TF_ASSERT_OK(root.ToGraph(graph.get())); TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); // TODO(b/118970344): ctrl_trigger_a has inputs with mismatching deadness so // it won't be clustered. 
ctrl_trigger_b is okay to cluster but we don't @@ -982,7 +982,7 @@ TEST(XlaCompilationTest, RandomShape) { TF_ASSERT_OK(root.ToGraph(graph.get())); TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_EQ(clusters["shape"], ""); } @@ -1028,7 +1028,7 @@ TEST(XlaCompilationTest, RandomShapeWithFunc) { TF_ASSERT_OK( MarkForCompilationPassTestHelper::MarkForCompilation(&graph, fld.get())); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_EQ(clusters["fn_call"], ""); } @@ -1054,12 +1054,12 @@ TEST(XlaCompilationTest, RandomShapeOnXlaDevice) { for (Node* n : graph->nodes()) { if (absl::StartsWith(n->name(), /*prefix=*/"test/")) { - n->set_assigned_device_name(string(xla_gpu_device)); + n->set_assigned_device_name(std::string(xla_gpu_device)); } } TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_EQ(clusters["test/shape_rng"], ""); EXPECT_EQ(clusters["test/reshape"], ""); } @@ -1087,12 +1087,12 @@ TEST(XlaCompilationTest, TensorArrayShapeOnXlaDevice) { for (Node* n : graph->nodes()) { if (absl::StartsWith(n->name(), /*prefix=*/"test/")) { - n->set_assigned_device_name(string(xla_gpu_device)); + n->set_assigned_device_name(std::string(xla_gpu_device)); } } TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_NE(clusters["test/read"], ""); EXPECT_EQ(clusters["test/read"], clusters["test/reshape"]); } @@ -1133,15 +1133,15 @@ TEST(XlaCompilationTest, DontClusterMergingNodes) { for (Node* n : graph->nodes()) { if (absl::EndsWith(n->name(), /*suffix=*/"dev0")) { - 
n->set_assigned_device_name(string(xla_gpu_dev0)); + n->set_assigned_device_name(std::string(xla_gpu_dev0)); } else if (absl::EndsWith(n->name(), /*suffix=*/"dev1")) { - n->set_assigned_device_name(string(xla_gpu_dev1)); + n->set_assigned_device_name(std::string(xla_gpu_dev1)); } } TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); // Each of the MatMuls should be in a separate cluster. - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_NE(clusters["MatMul0_dev0"], clusters["MatMul1_dev1"]); EXPECT_NE(clusters["MatMulCombined_dev1"], clusters["MatMul0_dev0"]); EXPECT_NE(clusters["MatMulCombined_dev1"], clusters["MatMul1_dev1"]); @@ -1170,17 +1170,17 @@ TEST(XlaCompilationTest, DontClusterMergingNodesOnCPU) { for (Node* n : graph->nodes()) { if (absl::EndsWith(n->name(), /*suffix=*/"cpu")) { - n->set_assigned_device_name(string(xla_cpu_dev0)); + n->set_assigned_device_name(std::string(xla_cpu_dev0)); } else if (absl::EndsWith(n->name(), /*suffix=*/"dev0")) { - n->set_assigned_device_name(string(xla_gpu_dev0)); + n->set_assigned_device_name(std::string(xla_gpu_dev0)); } else if (absl::EndsWith(n->name(), /*suffix=*/"dev1")) { - n->set_assigned_device_name(string(xla_gpu_dev1)); + n->set_assigned_device_name(std::string(xla_gpu_dev1)); } } TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); // Each of the MatMuls should be in a separate cluster. 
- std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_NE(clusters["MatMul0_dev0"], clusters["MatMul1_dev1"]); EXPECT_NE(clusters["MatMulCombined_cpu"], clusters["MatMul0_dev0"]); EXPECT_NE(clusters["MatMulCombined_cpu"], clusters["MatMul1_dev1"]); @@ -1223,14 +1223,14 @@ TEST(XlaCompilationTest, NOT_DontClusterSpreadingNodes) { TF_ASSERT_OK(root.ToGraph(graph.get())); for (Node* n : graph->nodes()) { if (absl::EndsWith(n->name(), /*suffix=*/"dev0")) { - n->set_assigned_device_name(string(xla_gpu_dev0)); + n->set_assigned_device_name(std::string(xla_gpu_dev0)); } else if (absl::EndsWith(n->name(), /*suffix=*/"dev1")) { - n->set_assigned_device_name(string(xla_gpu_dev1)); + n->set_assigned_device_name(std::string(xla_gpu_dev1)); } } TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_EQ(clusters["A_dev0"], clusters["MatMulSource_dev0"]); EXPECT_NE(clusters["MatMul0_dev0"], clusters["MatMul1_dev1"]); EXPECT_NE(clusters["MatMulSource_dev0"], clusters["MatMul1_dev1"]); @@ -1254,12 +1254,12 @@ TEST(XlaCompilationTest, ClusterStatefulRandomOpOnXlaDevice) { for (Node* n : graph->nodes()) { if (absl::StartsWith(n->name(), /*prefix=*/"test/")) { - n->set_assigned_device_name(string(xla_cpu_device)); + n->set_assigned_device_name(std::string(xla_cpu_device)); } } TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_NE(clusters["test/a"], ""); EXPECT_NE(clusters["test/b"], ""); EXPECT_NE(clusters["test/c"], ""); @@ -1277,7 +1277,7 @@ TEST(XlaCompilationTest, DontAutoClusterStatefulRandomOp) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = 
GetClusters(*graph); EXPECT_EQ(clusters["test/a"], ""); EXPECT_EQ(clusters["test/b"], ""); } @@ -1299,12 +1299,12 @@ TEST(XlaCompilationTest, ClusterDummyOpsOnXlaDevice) { for (Node* n : graph->nodes()) { if (absl::StartsWith(n->name(), /*prefix=*/"test/")) { - n->set_assigned_device_name(string(xla_cpu_device)); + n->set_assigned_device_name(std::string(xla_cpu_device)); } } TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_NE(clusters["test/check"], ""); EXPECT_NE(clusters["test/greaterequal"], ""); EXPECT_NE(clusters["test/assert"], ""); @@ -1324,7 +1324,7 @@ TEST(XlaCompilationTest, DontAutoClusterDummyOps) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_EQ(clusters["test/assert"], ""); EXPECT_EQ(clusters["test/check"], ""); } @@ -1345,7 +1345,7 @@ TEST(XlaCompilationTest, DontAutoClusterOpsProducingVariant) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_EQ(clusters["test/tensor_list_reserve"], ""); } @@ -1373,7 +1373,7 @@ TEST(XlaCompilationTest, DontAutoClusterOpsConsumingVariant) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_EQ(clusters["test/tensor_list_element_shape"], ""); } @@ -1391,7 +1391,7 @@ TEST(XlaCompilationTest, ClusterOpsProducingVariantIfOnXlaDevice) { std::unique_ptr graph(new Graph(OpRegistry::Global())); TF_ASSERT_OK(root.ToGraph(graph.get())); - string xla_cpu_device = "/job:worker/replica:0/task:0/device:XLA_CPU:0"; + std::string xla_cpu_device = 
"/job:worker/replica:0/task:0/device:XLA_CPU:0"; for (Node* n : graph->nodes()) { if (absl::StartsWith(n->name(), /*prefix=*/"test/")) { n->set_assigned_device_name(xla_cpu_device); @@ -1400,7 +1400,7 @@ TEST(XlaCompilationTest, ClusterOpsProducingVariantIfOnXlaDevice) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_NE(clusters["test/tensor_list_reserve"], ""); } @@ -1427,7 +1427,7 @@ TEST(XlaCompilationTest, CreateCombinedCpuGpuClusters) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_NE(clusters["test/x"], ""); @@ -1451,7 +1451,7 @@ TEST(XlaCompilationTest, DontCreateGpu0AndGpu1Clusters) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_EQ(clusters["test/x"], ""); EXPECT_EQ(clusters["test/y"], ""); @@ -1473,7 +1473,7 @@ TEST(XlaCompilationTest, DontCreateCombinedCpuUnknownClusters) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_EQ(clusters["test/x"], ""); EXPECT_EQ(clusters["test/y"], ""); @@ -1486,8 +1486,8 @@ TEST(XlaCompilationTest, ClusterResourceOpsWhenSafe) { Node* resource_read = MakeRead(root, "read", &var_handle); Output b = ops::Add(root.WithOpName("test/b"), Output(resource_read, 0), a); - string resource_read_name = resource_read->name(); - string var_handle_name = var_handle->name(); + std::string resource_read_name = resource_read->name(); + std::string var_handle_name = var_handle->name(); std::unique_ptr graph(new Graph(OpRegistry::Global())); TF_ASSERT_OK(root.ToGraph(graph.get())); @@ 
-1499,7 +1499,7 @@ TEST(XlaCompilationTest, ClusterResourceOpsWhenSafe) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_NE(clusters["test/b"], ""); EXPECT_EQ(clusters["test/b"], clusters[resource_read_name]); @@ -1512,8 +1512,8 @@ TEST(XlaCompilationTest, DontClusterResourceOpsWhenUnsafe) { Node* resource_read = MakeRead(root, "read", &var_handle); Output b = ops::Add(root.WithOpName("test/b"), Output(resource_read, 0), a); - string resource_read_name = resource_read->name(); - string var_handle_name = var_handle->name(); + std::string resource_read_name = resource_read->name(); + std::string var_handle_name = var_handle->name(); std::unique_ptr graph(new Graph(OpRegistry::Global())); TF_ASSERT_OK(root.ToGraph(graph.get())); @@ -1525,7 +1525,7 @@ TEST(XlaCompilationTest, DontClusterResourceOpsWhenUnsafe) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_EQ(clusters["test/b"], ""); EXPECT_EQ(clusters[resource_read_name], ""); @@ -1555,7 +1555,7 @@ TEST(XlaCompilationTest, DontClusterNodesWithScopedAllocatorAttr) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_EQ(clusters["test/z"], ""); } @@ -1580,7 +1580,7 @@ TEST(XlaCompilationTest, DontClusterNodesWithForwardFromAttr) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_EQ(clusters["test/z"], ""); } @@ -1610,7 +1610,7 @@ TEST(XlaCompilationTest, ClusterShapeConsumerWithProducer) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - 
std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_NE(clusters["test/y"], ""); EXPECT_EQ(clusters["test/x"], clusters["test/y"]); @@ -1632,7 +1632,7 @@ TEST(XlaCompilationTest, ClusterShapeConsumerWithProducerAndConsumer) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_NE(clusters["test/y"], ""); EXPECT_EQ(clusters["test/y"], clusters["test/x"]); @@ -1705,7 +1705,7 @@ TEST(XlaCompilationTest, IterationIncrementAndGroupDeps) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_NE(clusters["some_ctrl_input"], ""); EXPECT_EQ(clusters["some_ctrl_input"], clusters["weights_0_update"]); @@ -1875,19 +1875,19 @@ TEST(XlaCompilationTest, ClusterSessionName) { TF_ASSERT_OK( MarkForCompilationPassTestHelper::MarkForCompilation(&graph, options)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); ASSERT_FALSE(clusters.empty()); - string cluster_name = clusters.begin()->second; + std::string cluster_name = clusters.begin()->second; - std::unordered_map expected_clusters( + std::unordered_map expected_clusters( {{"negate", cluster_name}, {"add", cluster_name}}); EXPECT_EQ(clusters, expected_clusters); EXPECT_THAT(cluster_name, ::testing::StartsWith("test_session_name")); } namespace { -Node* MakeStageNode(GraphDefBuilder& builder, string name, +Node* MakeStageNode(GraphDefBuilder& builder, std::string name, std::initializer_list dtypes, absl::Span values) { auto opts = builder.opts() @@ -1949,7 +1949,7 @@ TEST(XlaCompilationTest, StagePipelinePreservedByClusterScopingPass) { &graph, MarkForCompilationPassTestHelper::Options().WithNoClusterScoping())); - std::unordered_map clusters = 
GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_EQ(clusters["add0"], clusters["add1"]); EXPECT_EQ(clusters["add0"], clusters["relu1"]); EXPECT_EQ(clusters["relu0"], clusters["add1"]); @@ -1964,7 +1964,7 @@ TEST(XlaCompilationTest, StagePipelinePreservedByClusterScopingPass) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map clusters = GetClusters(*graph); + std::unordered_map clusters = GetClusters(*graph); EXPECT_NE(clusters["add0"], clusters["add1"]); EXPECT_NE(clusters["add0"], clusters["relu1"]); EXPECT_NE(clusters["relu0"], clusters["add1"]); @@ -1973,9 +1973,9 @@ TEST(XlaCompilationTest, StagePipelinePreservedByClusterScopingPass) { } TEST(XlaCompilationTest, XLALiteAllowlist) { auto* allowlist_table = tensorflow::GetAllowlistTable(); - absl::flat_hash_set hallowlist; - std::vector vall_ops = XlaOpRegistry::GetAllRegisteredOps(); - absl::flat_hash_set all_ops(vall_ops.begin(), vall_ops.end()); + absl::flat_hash_set hallowlist; + std::vector vall_ops = XlaOpRegistry::GetAllRegisteredOps(); + absl::flat_hash_set all_ops(vall_ops.begin(), vall_ops.end()); // Check that all the operations in the table are existing TF operations for (auto pair : *allowlist_table) { @@ -1988,10 +1988,10 @@ TEST(XlaCompilationTest, XLALiteAllowlist) { // Check that all registered XLA operation are in the allowlist // table or are known to not be in it. 
- absl::flat_hash_set known_not_in_list = + absl::flat_hash_set known_not_in_list = tensorflow::testing::GetKnownXLAAllowlistOp(); - std::vector unknow_op; - for (string op : vall_ops) { + std::vector unknow_op; + for (std::string op : vall_ops) { if (!hallowlist.contains(op) && !known_not_in_list.contains(op)) { unknow_op.push_back(op); } diff --git a/tensorflow/compiler/jit/node_matchers.cc b/tensorflow/compiler/jit/node_matchers.cc index ce1f2cd5bcd671..db158fc84a0173 100644 --- a/tensorflow/compiler/jit/node_matchers.cc +++ b/tensorflow/compiler/jit/node_matchers.cc @@ -35,7 +35,7 @@ namespace { using impl::NodeMatcherProperties; using impl::OutEdge; -string IndentAllButFirstLine(absl::string_view text) { +std::string IndentAllButFirstLine(absl::string_view text) { std::vector lines = absl::StrSplit(text, '\n'); for (int i = 1; i < lines.size(); i++) { lines[i].insert(0, " "); @@ -86,21 +86,21 @@ bool MatchAndExplainTensor(const Tensor& tensor, const Tensor& expected_tensor, case DT_DOUBLE: return CompareTensor(tensor, expected_tensor, listener); case DT_INT8: - return CompareTensor(tensor, expected_tensor, listener); + return CompareTensor(tensor, expected_tensor, listener); case DT_INT16: - return CompareTensor(tensor, expected_tensor, listener); + return CompareTensor(tensor, expected_tensor, listener); case DT_INT32: - return CompareTensor(tensor, expected_tensor, listener); + return CompareTensor(tensor, expected_tensor, listener); case DT_INT64: return CompareTensor(tensor, expected_tensor, listener); case DT_UINT8: - return CompareTensor(tensor, expected_tensor, listener); + return CompareTensor(tensor, expected_tensor, listener); case DT_UINT16: - return CompareTensor(tensor, expected_tensor, listener); + return CompareTensor(tensor, expected_tensor, listener); case DT_UINT32: - return CompareTensor(tensor, expected_tensor, listener); + return CompareTensor(tensor, expected_tensor, listener); case DT_UINT64: - return CompareTensor(tensor, 
expected_tensor, listener); + return CompareTensor(tensor, expected_tensor, listener); default: LOG(FATAL) << "Unsupported dtype " // Crash ok: testonly. << DataType_Name(tensor.dtype()); @@ -188,7 +188,7 @@ struct NodeMatcher : public ::testing::MatcherInterface { if (control_dep_set && !control_dep_set->MatchAndExplain(control_deps, &inner_listener)) { if (listener->IsInterested()) { - string explanation = inner_listener.str(); + std::string explanation = inner_listener.str(); if (!explanation.empty()) { explanation = absl::StrCat(", ", explanation, ","); } @@ -225,7 +225,7 @@ struct NodeMatcher : public ::testing::MatcherInterface { } void DescribeTo(::std::ostream* os) const override { - std::vector predicates; + std::vector predicates; if (name) { predicates.push_back(absl::StrCat("name: ", *name)); @@ -282,10 +282,11 @@ struct NodeMatcher : public ::testing::MatcherInterface { if (!attrs.empty()) { printed_something = true; - std::vector attrs_str; + std::vector attrs_str; absl::c_transform( attrs, std::back_inserter(attrs_str), - [](const std::pair>& attr_kv_pair) { + [](const std::pair>& + attr_kv_pair) { return absl::StrCat(attr_kv_pair.first, "->", attr_kv_pair.second ? 
SummarizeAttrValue(*attr_kv_pair.second) @@ -319,7 +320,7 @@ struct NodeMatcher : public ::testing::MatcherInterface { if (listener->IsInterested()) { *listener << "\ninput " << input_idx << " does not match expected:\n"; (*input_matchers)[input_idx].DescribeTo(listener->stream()); - string explanation = inner_listener.str(); + std::string explanation = inner_listener.str(); if (!explanation.empty()) { *listener << ", " << explanation; } @@ -327,14 +328,14 @@ struct NodeMatcher : public ::testing::MatcherInterface { return false; } - std::optional op; - std::optional name; - std::optional assigned_device; + std::optional op; + std::optional name; + std::optional assigned_device; std::optional constant_value; std::optional>> input_matchers; std::optional<::testing::Matcher>> control_dep_set; - std::map> attrs; + std::map> attrs; }; // Matches a dst and dst_output on an input edge. Today we only use this with @@ -352,7 +353,7 @@ class OutEdgeMatcher : public ::testing::MatcherInterface { if (listener->IsInterested()) { *listener << "\nsource does not match expected "; src_matcher_.DescribeTo(listener->stream()); - string explanation = inner_listener.str(); + std::string explanation = inner_listener.str(); if (!explanation.empty()) { *listener << "\n\t" << explanation; } @@ -432,21 +433,21 @@ ::testing::Matcher impl::NodeWith( return ::testing::MakeMatcher(matcher); } -impl::NodeMatcherProperties Name(string name) { +impl::NodeMatcherProperties Name(std::string name) { impl::NodeMatcherProperties props; props.set_name(std::move(name)); return props; } // Matches a node with op `op`. -impl::NodeMatcherProperties Op(string op) { +impl::NodeMatcherProperties Op(std::string op) { impl::NodeMatcherProperties props; props.set_op(std::move(op)); return props; } // Matches a node with assigned device `assigned_device`. 
-impl::NodeMatcherProperties AssignedDevice(string assigned_device) { +impl::NodeMatcherProperties AssignedDevice(std::string assigned_device) { impl::NodeMatcherProperties props; props.set_assigned_device(std::move(assigned_device)); return props; @@ -472,15 +473,15 @@ impl::NodeMatcherProperties impl::CtrlDeps( return props; } -std::pair impl::AttrLiteralHelper( - const std::pair& bool_attr) { +std::pair impl::AttrLiteralHelper( + const std::pair& bool_attr) { AttrValue attr_value; attr_value.set_b(bool_attr.second); return {bool_attr.first, attr_value}; } -std::pair impl::AttrLiteralHelper( - const std::pair>& int_list_attr) { +std::pair impl::AttrLiteralHelper( + const std::pair>& int_list_attr) { AttrValue attr_value; AttrValue::ListValue* list = attr_value.mutable_list(); for (int i : int_list_attr.second) { @@ -489,23 +490,24 @@ std::pair impl::AttrLiteralHelper( return {int_list_attr.first, attr_value}; } -std::pair impl::AttrLiteralHelper( - const std::pair>& string_list_attr) { +std::pair impl::AttrLiteralHelper( + const std::pair>& + string_list_attr) { AttrValue attr_value; AttrValue::ListValue* list = attr_value.mutable_list(); - for (const string& s : string_list_attr.second) { + for (const std::string& s : string_list_attr.second) { list->add_s(s); } return {string_list_attr.first, attr_value}; } -impl::NodeMatcherProperties impl::Attr(std::pair attr) { +impl::NodeMatcherProperties impl::Attr(std::pair attr) { impl::NodeMatcherProperties props; props.set_attr(std::move(attr)); return props; } -impl::NodeMatcherProperties impl::Attr(string name) { +impl::NodeMatcherProperties impl::Attr(std::string name) { impl::NodeMatcherProperties props; props.set_attr({std::move(name), std::nullopt}); return props; diff --git a/tensorflow/compiler/jit/node_matchers.h b/tensorflow/compiler/jit/node_matchers.h index bb2c1875306185..1391df3743bd4c 100644 --- a/tensorflow/compiler/jit/node_matchers.h +++ b/tensorflow/compiler/jit/node_matchers.h @@ -84,11 +84,11 @@ 
class NodeMatcherProperties { public: using NodeSeqMatcher = std::vector<::testing::Matcher>; using InputSeqMatcher = std::vector<::testing::Matcher>; - using AttrKeyValuePair = std::pair>; + using AttrKeyValuePair = std::pair>; - const std::optional& name() const { return name_; } - const std::optional& op() const { return op_; } - const std::optional& assigned_device() const { + const std::optional& name() const { return name_; } + const std::optional& op() const { return op_; } + const std::optional& assigned_device() const { return assigned_device_; } const std::optional& constant_value() const { @@ -102,17 +102,17 @@ class NodeMatcherProperties { } const std::optional& attr() const { return attr_; } - void set_name(string name) { + void set_name(std::string name) { DCHECK(IsEmpty()); name_ = std::move(name); } - void set_op(string op) { + void set_op(std::string op) { DCHECK(IsEmpty()); op_ = std::move(op); } - void set_assigned_device(string assigned_device) { + void set_assigned_device(std::string assigned_device) { DCHECK(IsEmpty()); assigned_device_ = std::move(assigned_device); } @@ -144,9 +144,9 @@ class NodeMatcherProperties { } private: - std::optional name_; - std::optional op_; - std::optional assigned_device_; + std::optional name_; + std::optional op_; + std::optional assigned_device_; std::optional constant_value_; std::optional input_matchers_; std::optional control_deps_; @@ -162,39 +162,40 @@ impl::NodeMatcherProperties Inputs( impl::NodeMatcherProperties CtrlDeps( absl::Span> control_deps); -impl::NodeMatcherProperties Attr(std::pair attrs); -impl::NodeMatcherProperties Attr(string name); +impl::NodeMatcherProperties Attr(std::pair attrs); +impl::NodeMatcherProperties Attr(std::string name); -std::pair AttrLiteralHelper( - const std::pair& bool_attr); +std::pair AttrLiteralHelper( + const std::pair& bool_attr); -std::pair AttrLiteralHelper( - const std::pair>& int_list_attr); +std::pair AttrLiteralHelper( + const std::pair>& int_list_attr); 
-std::pair AttrLiteralHelper( - const std::pair>& string_list_attr); +std::pair AttrLiteralHelper( + const std::pair>& + string_list_attr); } // namespace impl // ----------------------------------------------------------------------------- // Public interface. // Matches a node with name `name`. -impl::NodeMatcherProperties Name(string name); +impl::NodeMatcherProperties Name(std::string name); // Matches a node with op `op`. -impl::NodeMatcherProperties Op(string op); +impl::NodeMatcherProperties Op(std::string op); // Matches a node with assigned device `assigned_device`. -impl::NodeMatcherProperties AssignedDevice(string assigned_device); +impl::NodeMatcherProperties AssignedDevice(std::string assigned_device); // Matches a node with a boolean typed attribute named `name` and with value // `value`. template -impl::NodeMatcherProperties Attr(const string& name, ValueTy value) { +impl::NodeMatcherProperties Attr(const std::string& name, ValueTy value) { return impl::Attr({impl::AttrLiteralHelper({name, value})}); } -inline impl::NodeMatcherProperties Attr(const string& name) { +inline impl::NodeMatcherProperties Attr(const std::string& name) { return impl::Attr(name); } diff --git a/tensorflow/compiler/jit/node_matchers_test.cc b/tensorflow/compiler/jit/node_matchers_test.cc index 6f37d5617b6ce6..ac1d9ce3468df1 100644 --- a/tensorflow/compiler/jit/node_matchers_test.cc +++ b/tensorflow/compiler/jit/node_matchers_test.cc @@ -41,7 +41,7 @@ using testing::matchers::Op; using testing::matchers::Out; template -string Explain(const T& t, const M& m) { +std::string Explain(const T& t, const M& m) { ::testing::StringMatchResultListener listener; EXPECT_THAT(t, ::testing::Not(m)); // For the error message. 
EXPECT_FALSE(m.MatchAndExplain(t, &listener)); diff --git a/tensorflow/compiler/jit/partially_decluster_pass_test.cc b/tensorflow/compiler/jit/partially_decluster_pass_test.cc index c8bbcee20e3829..9539a14d060f42 100644 --- a/tensorflow/compiler/jit/partially_decluster_pass_test.cc +++ b/tensorflow/compiler/jit/partially_decluster_pass_test.cc @@ -100,7 +100,7 @@ absl::Status PartiallyDecluster(std::unique_ptr* graph) { return pass.Run(opt_options); } -Node* FindNodeByName(const Graph& graph, const string& name) { +Node* FindNodeByName(const Graph& graph, const std::string& name) { for (Node* node : graph.nodes()) { if (node->name() == name) { return node; @@ -109,7 +109,7 @@ Node* FindNodeByName(const Graph& graph, const string& name) { return nullptr; } -bool GetInputsForNode(const Graph& graph, const string& node_name, +bool GetInputsForNode(const Graph& graph, const std::string& node_name, std::vector* inputs) { const Node* node = FindNodeByName(graph, node_name); if (node == nullptr) { @@ -292,7 +292,7 @@ TEST(PartiallyDeclusterPassTest, DeclusterDependentNodes) { void AddToCluster(absl::Span nodes, absl::string_view cluster_name) { for (Node* n : nodes) { - n->AddAttr(kXlaClusterAttr, string(cluster_name)); + n->AddAttr(kXlaClusterAttr, std::string(cluster_name)); } } diff --git a/tensorflow/compiler/jit/pjrt_base_device.cc b/tensorflow/compiler/jit/pjrt_base_device.cc index ce7ed954575040..d25d77d6cff22b 100644 --- a/tensorflow/compiler/jit/pjrt_base_device.cc +++ b/tensorflow/compiler/jit/pjrt_base_device.cc @@ -17,8 +17,8 @@ limitations under the License. 
namespace tensorflow { namespace { -DeviceAttributes BuildPjRtBaseDeviceAttributes(const string& name_prefix, - const string& device_name, +DeviceAttributes BuildPjRtBaseDeviceAttributes(const std::string& name_prefix, + const std::string& device_name, int device_ordinal) { return Device::BuildDeviceAttributes( absl::StrCat(name_prefix, "/device:", device_name, ":", device_ordinal), diff --git a/tensorflow/compiler/jit/pjrt_device_context.cc b/tensorflow/compiler/jit/pjrt_device_context.cc index e4d88f5816ec87..0bbad40fe5f25c 100644 --- a/tensorflow/compiler/jit/pjrt_device_context.cc +++ b/tensorflow/compiler/jit/pjrt_device_context.cc @@ -139,7 +139,7 @@ void PjRtDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor, return; } - xla::PjRtFuture<> future = device_buffer->ToLiteral(literal.get()); + tsl::Future future = device_buffer->ToLiteral(literal.get()); future.OnReady([literal = std::move(literal), done = std::move(done)]( const absl::Status& status) { done(status); }); } diff --git a/tensorflow/compiler/jit/resource_operation_safety_analysis.cc b/tensorflow/compiler/jit/resource_operation_safety_analysis.cc index 2fee2b0b898890..33f09704d7c72b 100644 --- a/tensorflow/compiler/jit/resource_operation_safety_analysis.cc +++ b/tensorflow/compiler/jit/resource_operation_safety_analysis.cc @@ -143,7 +143,7 @@ bool IsEdgeSafe(XlaResourceOpKind from, XlaResourceOpKind to) { using ResourceOp = std::pair; -string ResourceOpToString(const ResourceOp& resource_op) { +std::string ResourceOpToString(const ResourceOp& resource_op) { return absl::StrCat( resource_op.first, ": ", XlaResourceOpInfo::XlaResourceOpKindToString(resource_op.second)); @@ -233,14 +233,14 @@ class ResourceOpSet { void operator=(const ResourceOpSet&) = delete; }; -string ResourceOpSetToString(const ResourceOpSet& resource_op_set) { - std::vector elements_debug_string; +std::string ResourceOpSetToString(const ResourceOpSet& resource_op_set) { + std::vector elements_debug_string; 
std::transform(resource_op_set.begin(), resource_op_set.end(), std::back_inserter(elements_debug_string), ResourceOpToString); return absl::StrCat("{", absl::StrJoin(elements_debug_string, ","), "}"); } -string NodeToString(const Node& n, XlaResourceOpKind resource_op_kind) { +std::string NodeToString(const Node& n, XlaResourceOpKind resource_op_kind) { return absl::StrCat( "[", n.name(), ": ", n.type_string(), "(", XlaResourceOpInfo::XlaResourceOpKindToString(resource_op_kind), ")", "]"); diff --git a/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc b/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc index 8a80b8ae9b3497..6b038c992f1715 100644 --- a/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc +++ b/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc @@ -38,7 +38,7 @@ limitations under the License. namespace tensorflow { namespace { -Node* MakeRead(const Scope& scope, const string& id) { +Node* MakeRead(const Scope& scope, const std::string& id) { Output var_handle = ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({})); Output read = @@ -46,7 +46,7 @@ Node* MakeRead(const Scope& scope, const string& id) { return read.node(); } -Node* MakeWrite(const Scope& scope, const string& id) { +Node* MakeWrite(const Scope& scope, const std::string& id) { Output var_handle = ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({})); Output value_to_write = @@ -56,7 +56,7 @@ Node* MakeWrite(const Scope& scope, const string& id) { return assign_op.operation.node(); } -Node* MakeModify(const Scope& scope, const string& id) { +Node* MakeModify(const Scope& scope, const std::string& id) { Output var_handle = ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({})); Output value_to_write = ops::Const(scope.WithOpName("Increment" + id), 1.0f); @@ -65,7 +65,7 @@ Node* MakeModify(const Scope& scope, const string& id) { return assign_add_op.operation.node(); } 
-Node* MakeNeutral(const Scope& scope, const string& id) { +Node* MakeNeutral(const Scope& scope, const std::string& id) { return ops::Const(scope.WithOpName("Const" + id), 42.0f).node(); } @@ -238,7 +238,8 @@ TEST(ResourceOperationSafetyAnalysisTest, WriteReadModify) { EXPECT_EQ(incompatible_pairs[1], write_modify_pair); } -FunctionDefLibrary CreateFunctionDefLibWithConstFunction(const string& name) { +FunctionDefLibrary CreateFunctionDefLibWithConstFunction( + const std::string& name) { FunctionDefLibrary flib_def; FunctionDef func = FunctionDefHelper::Create( /*function_name=*/name, /*in_def=*/{}, /*out_def=*/{"out: float"}, @@ -249,8 +250,8 @@ FunctionDefLibrary CreateFunctionDefLibWithConstFunction(const string& name) { return flib_def; } -Node* MakeCall(Graph* graph, const string& callee_name, const string& node_name, - absl::Status* status) { +Node* MakeCall(Graph* graph, const std::string& callee_name, + const std::string& node_name, absl::Status* status) { NodeDef call_node; call_node.set_name(node_name); call_node.set_op(callee_name); diff --git a/tensorflow/compiler/jit/shape_inference.h b/tensorflow/compiler/jit/shape_inference.h index 467ecb83a74aae..b1469d2d699bf1 100644 --- a/tensorflow/compiler/jit/shape_inference.h +++ b/tensorflow/compiler/jit/shape_inference.h @@ -35,7 +35,8 @@ struct InferredShape { DataType handle_type = DT_INVALID; PartialTensorShape handle_shape; }; -typedef std::unordered_map> GraphShapeInfo; +typedef std::unordered_map> + GraphShapeInfo; // Infer shapes for all Tensors in a graph, and save them in a map. The vector // for a Node contains the information about each of its outputs. 
diff --git a/tensorflow/compiler/jit/shape_inference_test.cc b/tensorflow/compiler/jit/shape_inference_test.cc index eaabf18c79603c..599d442de4b092 100644 --- a/tensorflow/compiler/jit/shape_inference_test.cc +++ b/tensorflow/compiler/jit/shape_inference_test.cc @@ -61,7 +61,7 @@ TEST(ShapeInferenceTest, Basics) { TF_ASSERT_OK(InferShapes(graph.get(), /*arg_shapes=*/{}, /*fnlib_def=*/nullptr, &shape_info)); - std::map> expected = { + std::map> expected = { {"A", {PartialTensorShape({2, 3})}}, {"B", {PartialTensorShape({3})}}, {"C", {PartialTensorShape()}}, {"D", {PartialTensorShape({2, 3})}}, {"E", {PartialTensorShape()}}, {"F", {PartialTensorShape()}}, @@ -94,7 +94,7 @@ TEST(ShapeInferenceTest, UseArgShapesForVariableBatchSize) { TF_ASSERT_OK(InferShapes(graph.get(), arg_shapes, /*fnlib_def=*/nullptr, &shape_info)); - std::map> expected = { + std::map> expected = { {"A", {PartialTensorShape({2, 3})}}, {"B", {PartialTensorShape({2, 3})}}, {"C", {PartialTensorShape({2, 3})}}, @@ -127,7 +127,7 @@ TEST(ShapeInferenceTest, UseArgShapesForVariableBatchSizeIncompleteUserArgs) { TF_ASSERT_OK(InferShapes(graph.get(), arg_shapes, /*fnlib_def=*/nullptr, &shape_info)); - std::map> expected = { + std::map> expected = { {"A", {PartialTensorShape({2, 3})}}, {"B", {PartialTensorShape({2, 3})}}, {"C", {PartialTensorShape({2, 3})}}, @@ -156,7 +156,7 @@ TEST(ShapeInferenceTest, WhileLoop) { ops::internal::Enter(scope.WithOpName("while/Enter2"), source, "aloop"); auto merge = ops::Merge(scope.WithOpName("while/Merge"), std::initializer_list{enter, dummy}); - auto ten = ops::Const( + auto ten = ops::Const( scope.WithOpName("while/Less/y").WithControlDependencies(merge.output), 10); auto less = ops::Less(scope.WithOpName("while/Less"), merge.output, ten); @@ -168,11 +168,11 @@ TEST(ShapeInferenceTest, WhileLoop) { auto identity = ops::Identity(scope.WithOpName("while/Identity"), switch_node.output_true); auto identity_shape = - ops::Const(scope.WithOpName("while/Identity/shape"), {}); 
+ ops::Const(scope.WithOpName("while/Identity/shape"), {}); auto identity_reshaped = ops::Reshape( scope.WithOpName("while/Identity/reshaped"), identity, identity_shape); - auto one = ops::Const( + auto one = ops::Const( scope.WithOpName("while/add/y").WithControlDependencies(identity), 1); auto add = ops::Add(scope.WithOpName("while/add"), identity_reshaped, one); auto next_iteration = @@ -190,7 +190,7 @@ TEST(ShapeInferenceTest, WhileLoop) { GraphShapeInfo shape_info; TF_ASSERT_OK(InferShapes(&graph, /*arg_shapes=*/{}, /*fnlib_def=*/nullptr, &shape_info)); - std::map> expected = { + std::map> expected = { {"while/Identity", {PartialTensorShape()}}, {"while/add", {PartialTensorShape({})}}, }; diff --git a/tensorflow/compiler/jit/test_util.cc b/tensorflow/compiler/jit/test_util.cc index 81ab1d8d05f96e..30a9ab51faf105 100644 --- a/tensorflow/compiler/jit/test_util.cc +++ b/tensorflow/compiler/jit/test_util.cc @@ -29,7 +29,7 @@ namespace tensorflow { absl::Status ShapeAnnotationsMatch( const Graph& graph, const GraphShapeInfo& shape_info, - std::map> expected_shapes) { + std::map> expected_shapes) { for (Node* node : graph.op_nodes()) { auto sit = shape_info.find(node->name()); TF_RET_CHECK(sit != shape_info.end()) @@ -50,7 +50,7 @@ absl::Status ShapeAnnotationsMatch( } } if (!expected_shapes.empty()) { - std::vector missing; + std::vector missing; missing.reserve(expected_shapes.size()); for (const auto& entry : expected_shapes) { missing.push_back(entry.first); @@ -88,12 +88,12 @@ void DeviceSetup::AddDevicesAndSetUp( flr_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:0"); } -Device* DeviceSetup::GetDevice(const string& device_name) { +Device* DeviceSetup::GetDevice(const std::string& device_name) { if (device_mgr_ == nullptr) { return nullptr; } - string full_device_name = absl::StrCat( + std::string full_device_name = absl::StrCat( "/job:localhost/replica:0/task:0/device:", device_name, ":0"); Device* device; 
TF_CHECK_OK(device_mgr_->LookupDevice(full_device_name, &device)); diff --git a/tensorflow/compiler/jit/test_util.h b/tensorflow/compiler/jit/test_util.h index ec694662297399..ba7d2533ef7c74 100644 --- a/tensorflow/compiler/jit/test_util.h +++ b/tensorflow/compiler/jit/test_util.h @@ -44,7 +44,7 @@ namespace tensorflow { // `expected_shapes` entries. absl::Status ShapeAnnotationsMatch( const Graph& graph, const GraphShapeInfo& shape_info, - std::map> expected_shapes); + std::map> expected_shapes); // A helper object to create GraphOptimizationPassOptions. struct GraphOptimizationPassWrapper { @@ -74,7 +74,7 @@ class DeviceSetup { void AddDevicesAndSetUp( const std::vector& device_names, const std::optional& fdef = std::nullopt); - Device* GetDevice(const string& device_name); + Device* GetDevice(const std::string& device_name); FunctionLibraryRuntime* flr() { return flr_; } private: diff --git a/tensorflow/compiler/jit/tests/auto_clustering_test.cc b/tensorflow/compiler/jit/tests/auto_clustering_test.cc index 90e73c23d210d7..d108bc51b5ee33 100644 --- a/tensorflow/compiler/jit/tests/auto_clustering_test.cc +++ b/tensorflow/compiler/jit/tests/auto_clustering_test.cc @@ -23,7 +23,7 @@ class AutoClusteringTestImpl : public AutoClusteringTest { protected: // Test auto-clustering with a proto text file ${key}.pbtxt. absl::Status RunAutoClusteringTestWithPbtxt(absl::string_view key) { - string file_name_without_extension = + std::string file_name_without_extension = absl::StrCat(testing::TensorFlowSrcRoot(), "/compiler/jit/tests/", key); return AutoClusteringTest::RunAutoClusteringTestWithPbtxt( @@ -33,7 +33,7 @@ class AutoClusteringTestImpl : public AutoClusteringTest { // Test auto-clustering with a gzipped proto text file ${key}.pbtxt.gz. 
absl::Status RunAutoClusteringTestWithGzippedPbtxt(absl::string_view key) { - string file_name_without_extension = + std::string file_name_without_extension = absl::StrCat(testing::TensorFlowSrcRoot(), "/compiler/jit/tests/", key); return AutoClusteringTest::RunAutoClusteringTestWithGzippedPbtxt( diff --git a/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc b/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc index dee77ac750ee54..258449e91120e1 100644 --- a/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc +++ b/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc @@ -33,7 +33,7 @@ limitations under the License. namespace tensorflow { namespace { -absl::StatusOr SummarizeClustering( +absl::StatusOr SummarizeClustering( const GraphDef& auto_clustered_graph_def) { testing::ResetClusterSequenceNumber(); Graph graph(OpRegistry::Global()); @@ -45,7 +45,7 @@ absl::StatusOr SummarizeClustering( // cluster_id -> (operation name -> # of operations) const int kNoCluster = -1; - std::map> clusters; + std::map> clusters; std::map cluster_size; int clustered_nodes = 0; for (Node* n : graph.op_nodes()) { @@ -60,7 +60,7 @@ absl::StatusOr SummarizeClustering( cluster_size[cluster]++; } - string result = + std::string result = absl::StrCat("Clustered nodes: ", clustered_nodes, "\nUnclustered nodes: ", cluster_size[kNoCluster], "\nNumber of clusters: ", clusters.size() - 1, "\n\n"); @@ -99,7 +99,7 @@ absl::Status AssertGraphDefIsUnclustered(const GraphDef& graphdef) { return absl::OkStatus(); } -absl::Status ReadTextProtoFromString(Env* env, const string& data, +absl::Status ReadTextProtoFromString(Env* env, const std::string& data, ::tensorflow::protobuf::Message* proto) { if (!::tensorflow::protobuf::TextFormat::ParseFromString(data, proto)) { return errors::DataLoss("Can't parse input data as text proto"); @@ -141,7 +141,8 @@ absl::Status AutoClusteringTest::RunAutoClusteringTestImpl( graphdef = std::move(next); } - 
TF_ASSIGN_OR_RETURN(string clustering_summary, SummarizeClustering(graphdef)); + TF_ASSIGN_OR_RETURN(std::string clustering_summary, + SummarizeClustering(graphdef)); // To update golden files flip this to true and run // @@ -149,13 +150,15 @@ absl::Status AutoClusteringTest::RunAutoClusteringTestImpl( // tensorflow/compiler/jit/tests:auto_clustering_test bool update_golden = false; if (update_golden) { - TF_RETURN_IF_ERROR(WriteStringToFile( - Env::Default(), string(golden_summary_file_path), clustering_summary)); + TF_RETURN_IF_ERROR(WriteStringToFile(Env::Default(), + std::string(golden_summary_file_path), + clustering_summary)); } - string golden_file_contents; - TF_RETURN_IF_ERROR(ReadFileToString( - Env::Default(), string(golden_summary_file_path), &golden_file_contents)); + std::string golden_file_contents; + TF_RETURN_IF_ERROR(ReadFileToString(Env::Default(), + std::string(golden_summary_file_path), + &golden_file_contents)); EXPECT_EQ(golden_file_contents, clustering_summary); @@ -167,7 +170,7 @@ absl::Status AutoClusteringTest::RunAutoClusteringTestWithPbtxt( absl::string_view golden_summary_file_path) { GraphDef graphdef; TF_RETURN_IF_ERROR( - ReadTextProto(Env::Default(), string(pbtxt_file_path), &graphdef)); + ReadTextProto(Env::Default(), std::string(pbtxt_file_path), &graphdef)); return RunAutoClusteringTestImpl(std::move(graphdef), golden_summary_file_path); } @@ -177,8 +180,8 @@ absl::Status AutoClusteringTest::RunAutoClusteringTestWithGzippedPbtxt( absl::string_view golden_summary_file_path) { Env* env = Env::Default(); std::unique_ptr file_reader; - TF_RETURN_IF_ERROR( - env->NewRandomAccessFile(string(gzipped_pbtxt_file_path), &file_reader)); + TF_RETURN_IF_ERROR(env->NewRandomAccessFile( + std::string(gzipped_pbtxt_file_path), &file_reader)); std::unique_ptr input_stream( new io::RandomAccessInputStream(file_reader.get())); constexpr int k_buffer_size = 256 << 10; // 256kb @@ -206,7 +209,7 @@ absl::Status 
BenchmarkMarkForCompilation(absl::string_view graph_def_path, benchmark::State& state) { GraphDef graph_def; TF_RETURN_IF_ERROR( - ReadTextProto(Env::Default(), string(graph_def_path), &graph_def)); + ReadTextProto(Env::Default(), std::string(graph_def_path), &graph_def)); OptimizationPassRunner runner; TF_RETURN_IF_ERROR(runner.SetJitLevel(tensorflow::OptimizerOptions::ON_2)); diff --git a/tensorflow/compiler/jit/tests/device_compiler_test_helper.cc b/tensorflow/compiler/jit/tests/device_compiler_test_helper.cc index e4be1a1f641656..33e2daf941eafb 100644 --- a/tensorflow/compiler/jit/tests/device_compiler_test_helper.cc +++ b/tensorflow/compiler/jit/tests/device_compiler_test_helper.cc @@ -29,7 +29,7 @@ namespace { // Creates a float tensor of linearly increasing values, starting from offset. Tensor CreateInputTensor(const TensorShape& shape, float offset) { Tensor tensor(DT_FLOAT, shape); - for (int64 i = 0; i < tensor.flat().size(); ++i) { + for (int64_t i = 0; i < tensor.flat().size(); ++i) { tensor.flat()(i) = offset + i; } return tensor; @@ -127,7 +127,7 @@ absl::Status DeviceCompilerSerializeTest::ExecuteWithBatch( } Tensor f32_input(DT_FLOAT, shape); - for (int64 i = 0; i < f32_input.NumElements(); ++i) { + for (int64_t i = 0; i < f32_input.NumElements(); ++i) { EXPECT_NEAR(golden_output_tensors[0].flat()(i), output_tensors[0].flat()(i), 1e-3); } @@ -139,7 +139,7 @@ DeviceCompilerSerializeTest::AlterPersistentCacheEntryHloModuleNames( absl::string_view persistent_cache_dir_path, absl::string_view file_prefix) { Env* env = Env::Default(); - std::vector file_names; + std::vector file_names; TF_RETURN_IF_ERROR( env->GetChildren(tensorflow::testing::TmpDir(), &file_names)); diff --git a/tensorflow/compiler/jit/xla_host_send_recv_device_context_test.cc b/tensorflow/compiler/jit/xla_host_send_recv_device_context_test.cc index bec124f1866689..62089beed8224f 100644 --- a/tensorflow/compiler/jit/xla_host_send_recv_device_context_test.cc +++ 
b/tensorflow/compiler/jit/xla_host_send_recv_device_context_test.cc @@ -34,8 +34,12 @@ namespace { class XlaHostSendRecvDeviceContextTest : public ::testing::Test { public: - void SetDevice(const string& device_type) { + absl::Status SetDevice(const string& device_type) { auto device_factory = DeviceFactory::GetFactory(device_type); + if (device_factory == nullptr) { + return absl::NotFoundError( + "Failed to get DeviceFactory for device_type: " + device_type); + } SessionOptions options; std::vector> devices; Status s = device_factory->CreateDevices( @@ -49,6 +53,7 @@ class XlaHostSendRecvDeviceContextTest : public ::testing::Test { AllocatorAttributes device_alloc_attr; device_alloc_attr.set_on_host(false); device_allocator_ = device_->GetAllocator(device_alloc_attr); + return absl::OkStatus(); } protected: @@ -58,7 +63,7 @@ class XlaHostSendRecvDeviceContextTest : public ::testing::Test { }; TEST_F(XlaHostSendRecvDeviceContextTest, CopyDeviceTensorToCPU) { - SetDevice("GPU"); + TF_ASSERT_OK(SetDevice("GPU")); Tensor origin_cpu_tensor(host_allocator_, DT_FLOAT, TensorShape({2, 2})); test::FillValues(&origin_cpu_tensor, {1.2, 2.3, 3.4, 4.5}); Tensor device_tensor(device_allocator_, DT_FLOAT, TensorShape({2, 2})); @@ -93,7 +98,7 @@ TEST_F(XlaHostSendRecvDeviceContextTest, CopyDeviceTensorToCPU) { } TEST_F(XlaHostSendRecvDeviceContextTest, CopyCPUTensorToDevice) { - SetDevice("GPU"); + TF_ASSERT_OK(SetDevice("GPU")); Tensor origin_cpu_tensor(host_allocator_, DT_FLOAT, TensorShape({2, 2})); test::FillValues(&origin_cpu_tensor, {1.2, 2.3, 3.4, 4.5}); Tensor device_tensor(device_allocator_, DT_FLOAT, TensorShape({2, 2})); @@ -127,7 +132,7 @@ TEST_F(XlaHostSendRecvDeviceContextTest, CopyCPUTensorToDevice) { } TEST_F(XlaHostSendRecvDeviceContextTest, RoundTrip) { - SetDevice("GPU"); + TF_ASSERT_OK(SetDevice("GPU")); Tensor origin_cpu_tensor(host_allocator_, DT_FLOAT, TensorShape({2, 2})); test::FillValues(&origin_cpu_tensor, {1.2, 2.3, 3.4, 4.5}); Tensor 
device_tensor(device_allocator_, DT_FLOAT, TensorShape({2, 2})); diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index f26fcd34df7583..8ccb236897ce39 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -45,11 +45,11 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_resource.h" #include "xla/client/local_client.h" +#include "xla/future.h" #include "xla/hlo/ir/hlo_input_output_alias_config.h" #include "xla/pjrt/pjrt_client.h" #include "xla/pjrt/pjrt_common.h" #include "xla/pjrt/pjrt_executable.h" -#include "xla/pjrt/pjrt_future.h" #include "xla/service/executable.h" #include "xla/service/maybe_owning_device_memory.h" #include "xla/service/shaped_buffer.h" @@ -809,8 +809,6 @@ xla::ExecuteOptions GetPjRtExecuteOptions( const DeviceType& device_type, absl::flat_hash_set non_donatable_input_indices) { xla::ExecuteOptions options; - options.arguments_are_tupled = false; - options.untuple_result = true; // Hardcode run id to always be one: TF distributed strategy // differentiates between subsequent runs using dependency edges. 
This // is safe, as only TF dist-strat can produce distributed ops, and we @@ -925,7 +923,7 @@ absl::StatusOr>> RunPjRtExecutable( &executable_args, &owned_executable_args, &non_donatable_input_indices)); std::vector> execute_outputs; - std::optional> future; + std::optional> future; if (executable->num_replicas() != 1 || executable->num_partitions() != 1) { TF_ASSIGN_OR_RETURN( execute_outputs, diff --git a/tensorflow/compiler/jit/xla_launch_util_test.cc b/tensorflow/compiler/jit/xla_launch_util_test.cc index 9e71286dc95df8..d8ed5feac79f12 100644 --- a/tensorflow/compiler/jit/xla_launch_util_test.cc +++ b/tensorflow/compiler/jit/xla_launch_util_test.cc @@ -207,8 +207,6 @@ class PjRtExecutionUtilTest : public OpsTestBase { &executable_args, /*owned_args=*/{}, &non_donatable_input_indices)); xla::ExecuteOptions exe_options; - exe_options.arguments_are_tupled = false; - exe_options.untuple_result = true; // TODO(b/257548614): currently PJRT is compiled as portable (num_replica = // 1 and num_partition = 1). Support multiple partitions case. 
@@ -520,8 +518,6 @@ TEST_F(PjRtExecutionUtilTest, PopulateCtxOutputsResourceUpdates) { TEST(XlaLaunchUtilTest, GetPjRtExecuteOptions) { xla::ExecuteOptions options = GetPjRtExecuteOptions(DeviceType(DEVICE_GPU), {}); - EXPECT_FALSE(options.arguments_are_tupled); - EXPECT_TRUE(options.untuple_result); EXPECT_FALSE(options.strict_shape_checking); EXPECT_TRUE(options.use_major_to_minor_data_layout_for_callbacks); } diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 7f200aa186a466..ab6c5abeca86f0 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -1990,7 +1990,6 @@ cc_library( ":tf_tfl_passes", "//tensorflow/cc/saved_model:loader", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", - "//tensorflow/compiler/mlir/lite/core:macros", "//tensorflow/compiler/mlir/lite/debug", "//tensorflow/compiler/mlir/lite/experimental/remat:metadata_util", "//tensorflow/compiler/mlir/lite/metrics:converter_error_data_proto_cc", @@ -2212,10 +2211,8 @@ tf_proto_library( srcs = ["converter_flags.proto"], make_default_target_header_only = True, protodeps = [ - "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_options_proto", - "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto", - "//tensorflow/compiler/mlir/lite/debug:debug_options_proto", ":types_proto", + "//tensorflow/compiler/mlir/lite/debug:debug_options_proto", ], visibility = ["//visibility:public"], ) diff --git a/tensorflow/compiler/mlir/lite/converter_flags.proto b/tensorflow/compiler/mlir/lite/converter_flags.proto index 1c1a1ad00aea74..49795ad8337d9a 100644 --- a/tensorflow/compiler/mlir/lite/converter_flags.proto +++ b/tensorflow/compiler/mlir/lite/converter_flags.proto @@ -17,8 +17,6 @@ package tflite; import "tensorflow/compiler/mlir/lite/debug/debug_options.proto"; import "tensorflow/compiler/mlir/lite/types.proto"; -import "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto"; 
-import "tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.proto"; // Supported I/O file formats. Some formats may be input-only or output-only. enum FileFormat { @@ -43,6 +41,8 @@ enum FileFormat { // // Next ID to use: 69. message ConverterFlags { + reserved 54, 61; + // Input file format optional FileFormat input_format = 1; @@ -312,12 +312,6 @@ message ConverterFlags { // If true, disable folding mul->fc as in layer norm during optimize pass. optional bool disable_fuse_mul_and_fc = 53 [default = false]; - // Indicates the quantization specs. Quantization spec can be set to either - // a preset method or a custom method. - // Note: This is deprecated; use `quantization_config` instead. - optional stablehlo.quantization.QuantizationOptions quantization_options = 54 - [deprecated = true]; - // Flag to enable hlo to tf conversion. // This is useful to exercise StableHLO -> HLO -> TF -> TFLite path. optional bool enable_hlo_to_tf_conversion = 55 @@ -346,11 +340,6 @@ message ConverterFlags { // WARNING: Experimental interface, subject to change. optional string qdq_conversion_mode = 60 [default = "NONE"]; - // Configures quantization behavior. This config is fed to the StableHLO - // Quantizer integrated in the converter. - // WARNING: Experimental interface, subject to change. - optional stablehlo.quantization.QuantizationConfig quantization_config = 61; - // Disables per channel weights quantization for Dense layers and enables // legacy per tensor quantization. The legacy quantization for Dense layers is // inconsistent with Conv 1x1 which always performs per channel quantization. 
diff --git a/tensorflow/compiler/mlir/lite/core/api/flatbuffer_conversions.cc b/tensorflow/compiler/mlir/lite/core/api/flatbuffer_conversions.cc index 0e8210b97e315b..7facd69ecca298 100644 --- a/tensorflow/compiler/mlir/lite/core/api/flatbuffer_conversions.cc +++ b/tensorflow/compiler/mlir/lite/core/api/flatbuffer_conversions.cc @@ -341,6 +341,7 @@ using tflite::TensorType_FLOAT16; using tflite::TensorType_FLOAT32; using tflite::TensorType_FLOAT64; using tflite::TensorType_INT16; +using tflite::TensorType_INT2; using tflite::TensorType_INT32; using tflite::TensorType_INT4; using tflite::TensorType_INT64; @@ -1400,6 +1401,9 @@ absl::Status ConvertTensorType(TensorType tensor_type, TfLiteType* type) { case TensorType_INT4: *type = kTfLiteInt4; return OkStatus(); + case TensorType_INT2: + *type = kTfLiteInt2; + return OkStatus(); default: *type = kTfLiteNoType; auto error_message = diff --git a/tensorflow/compiler/mlir/lite/core/c/tflite_types.h b/tensorflow/compiler/mlir/lite/core/c/tflite_types.h index 068facb10761c7..f09923dda5fc7c 100644 --- a/tensorflow/compiler/mlir/lite/core/c/tflite_types.h +++ b/tensorflow/compiler/mlir/lite/core/c/tflite_types.h @@ -64,6 +64,7 @@ typedef enum { kTfLiteUInt16 = 17, kTfLiteInt4 = 18, kTfLiteBFloat16 = 19, + kTfLiteInt2 = 20, } TfLiteType; // LINT.ThenChange(//tensorflow/lite/profiling/proto/model_runtime_info.proto:EdgeDataType) diff --git a/tensorflow/compiler/mlir/lite/debug/debug_test.cc b/tensorflow/compiler/mlir/lite/debug/debug_test.cc index 6c26865757950a..b82d5725182745 100644 --- a/tensorflow/compiler/mlir/lite/debug/debug_test.cc +++ b/tensorflow/compiler/mlir/lite/debug/debug_test.cc @@ -120,7 +120,7 @@ class InitPassManagerTest : public testing::Test { } absl::Status GetDumpDir(std::string* dump_dir) { - std::vector files; + std::vector files; if (auto status = tsl::Env::Default()->GetChildren(path_, &files); !status.ok()) { return status; diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc 
b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index 93879c1d6254b9..41dffc228a6b2c 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -217,6 +217,13 @@ static StatusOr GetTFLiteType(Type type, switch (itype.getWidth()) { case 1: return tflite::TensorType_BOOL; + case 2: + if (itype.isUnsigned()) { + return Status(absl::StatusCode::kInvalidArgument, + "Unsupported 2bit unsigned int type"); + } else { + return tflite::TensorType_INT2; + } case 4: if (itype.isUnsigned()) { return Status(absl::StatusCode::kInvalidArgument, @@ -879,7 +886,7 @@ class Translator { std::vector>> string_buffers_to_delete_; std::vector>> - packed_int4_buffers_to_delete_; + packed_low_bit_buffers_to_delete_; // Maps custom options data to corresponding node // Key is set to be the list of input tensor indices and list of output tensor @@ -1027,18 +1034,21 @@ std::optional> Translator::BuildBuffer( auto type = mlir::cast(value.getType()); tflite::TensorType tflite_element_type = GetTFLiteType(type.getElementType()).value(); - if (tflite_element_type == tflite::TensorType_INT4) { + if (tflite_element_type == tflite::TensorType_INT4 || + tflite_element_type == tflite::TensorType_INT2) { std::vector data; for (mlir::APInt v : attr.getValues()) { data.emplace_back(static_cast(*(v.getRawData()))); } - auto packed_buffer = std::make_unique>( - tflite::PackInt4ValuesDensely(data)); + auto packed_buffer = + std::make_unique>(tflite::PackLowBitValuesDensely( + data, /*bit_width=*/( + tflite_element_type == tflite::TensorType_INT4 ? 
4 : 2))); if (use_buffer_offset_) { buffer_data_map_[index] = absl::string_view(reinterpret_cast(packed_buffer->data()), packed_buffer->size()); - packed_int4_buffers_to_delete_.emplace_back(std::move(packed_buffer)); + packed_low_bit_buffers_to_delete_.emplace_back(std::move(packed_buffer)); return tflite::CreateBuffer(builder_, 0, 1, 1); } else { if (IsModelBiggerThan2GB(packed_buffer->size())) { @@ -4239,10 +4249,10 @@ std::optional Translator::TranslateInternal() { // Free all the buffers/tensors, etc. that were created but were kept around // to copy into the flatbuffer. - for (auto& packed_int4_buffer : packed_int4_buffers_to_delete_) { - packed_int4_buffer.reset(); + for (auto& packed_low_bit_buffer : packed_low_bit_buffers_to_delete_) { + packed_low_bit_buffer.reset(); } - packed_int4_buffers_to_delete_.clear(); + packed_low_bit_buffers_to_delete_.clear(); for (auto& str_buffer : string_buffers_to_delete_) { str_buffer.reset(); diff --git a/tensorflow/compiler/mlir/lite/integrations/BUILD b/tensorflow/compiler/mlir/lite/integrations/BUILD index cae74c9c3ac7b2..64baf0b2fb7731 100644 --- a/tensorflow/compiler/mlir/lite/integrations/BUILD +++ b/tensorflow/compiler/mlir/lite/integrations/BUILD @@ -40,6 +40,7 @@ pybind_extension( "//tensorflow/compiler/mlir/tensorflow:convert_tensor", "//tensorflow/python/lib/core:ndarray_tensor", "//tensorflow/python/lib/core:py_func_lib", + "@com_google_absl//absl/strings:string_view", "@llvm-project//llvm:Support", "@llvm-project//mlir:AllPassesAndDialects", "@llvm-project//mlir:ArithDialect", @@ -48,15 +49,14 @@ pybind_extension( "@llvm-project//mlir:FuncExtensions", "@llvm-project//mlir:FuncTransforms", "@llvm-project//mlir:IR", - "@llvm-project//mlir:MLIRBindingsPythonHeaders", - "@llvm-project//mlir:MLIRBindingsPythonHeadersAndDeps", + "@llvm-project//mlir:MLIRBindingsPythonNanobindHeadersAndDeps", "@llvm-project//mlir:MlirOptLib", "@llvm-project//mlir:Pass", "@llvm-project//mlir:QuantOps", "@llvm-project//mlir:Support", 
"@llvm-project//mlir:Transforms", "@local_xla//third_party/python_runtime:headers", - "@pybind11", + "@nanobind", "@stablehlo//:register", "@stablehlo//:stablehlo_ops", "@stablehlo//:vhlo_ops", diff --git a/tensorflow/compiler/mlir/lite/integrations/model_utils_core_pybind.cc b/tensorflow/compiler/mlir/lite/integrations/model_utils_core_pybind.cc index 480901785329e7..80975abd3e9a7a 100644 --- a/tensorflow/compiler/mlir/lite/integrations/model_utils_core_pybind.cc +++ b/tensorflow/compiler/mlir/lite/integrations/model_utils_core_pybind.cc @@ -21,9 +21,10 @@ limitations under the License. #include "mlir/Support/LLVM.h" #include "mlir/Tools/mlir-opt/MlirOptMain.h" +#include "absl/strings/string_view.h" #include "llvm/Support/Casting.h" #include "mlir-c/IR.h" // from @llvm-project -#include "mlir/Bindings/Python/PybindAdaptors.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Bindings/Python/NanobindAdaptors.h" // from @llvm-project // IWYU pragma: keep #include "mlir/CAPI/IR.h" // from @llvm-project #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/Extensions/AllExtensions.h" // from @llvm-project @@ -40,9 +41,10 @@ limitations under the License. #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project -#include "pybind11/cast.h" // from @pybind11 -#include "pybind11/pybind11.h" // from @pybind11 -#include "pybind11/pytypes.h" // from @pybind11 +#include "nanobind/nanobind.h" // from @nanobind +#include "nanobind/stl/string.h" // from @nanobind +#include "nanobind/stl/string_view.h" // from @nanobind +#include "nanobind/stl/vector.h" // from @nanobind #include "stablehlo/dialect/Register.h" // from @stablehlo #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "stablehlo/dialect/VhloOps.h" // from @stablehlo @@ -57,7 +59,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/python/lib/core/ndarray_tensor.h" -namespace py = pybind11; +namespace nb = nanobind; // ----------------------------------------------------------------------------- // Module initialization. @@ -70,7 +72,7 @@ class MlirPythonPass mlir::OperationPass> { public: explicit MlirPythonPass(std::string name, std::string description, - py::object pyfunc) + nb::object pyfunc) : name_(name), description_(description), pyfunc_(pyfunc) { pyfunc.inc_ref(); } @@ -85,8 +87,8 @@ class MlirPythonPass auto module_clone = getOperation().clone(); MlirModule c_module = wrap(module_clone); - auto py_module = py::cast(c_module); - auto py_args = py::make_tuple(py_module); + auto py_module = nb::cast(c_module); + auto py_args = nb::make_tuple(py_module); PyObject* py_pass_ret = PyObject_CallObject(pyfunc_.ptr(), py_args.ptr()); if (py_pass_ret == nullptr || PyErr_Occurred()) { @@ -95,8 +97,8 @@ class MlirPythonPass signalPassFailure(); return; } - auto py_new_module_op = py::cast(py_pass_ret); - auto c_new_module_op = py::cast(py_new_module_op); + auto py_new_module_op = nb::steal(py_pass_ret); + auto c_new_module_op = nb::cast(py_new_module_op); mlir::Operation* new_module_op = unwrap(c_new_module_op); // TODO: Copy attributes from new_module @@ -108,7 +110,7 @@ class MlirPythonPass private: std::string name_; std::string description_; - py::object pyfunc_; + nb::object pyfunc_; }; inline void RegisterDialects(mlir::DialectRegistry& registry) { @@ -131,7 +133,7 @@ inline void RegisterPasses() { []() { return mlir::TFL::CreateOptimizePass(); }); } -PYBIND11_MODULE(model_utils_core_pybind, m) { +NB_MODULE(model_utils_core_pybind, m) { Py_Initialize(); m.doc() = "LiteRT ModelUtils Core Pybinds"; @@ -142,7 +144,7 @@ PYBIND11_MODULE(model_utils_core_pybind, m) { m.def("mlir_opt_main", [](std::vector argv, std::vector pass_names, std::vector pass_descriptions, - std::vector pass_fns) { + std::vector 
pass_fns) { std::vector c_argv_vec; c_argv_vec.reserve(argv.size()); for (size_t i = 0; i < argv.size(); ++i) @@ -178,14 +180,15 @@ PYBIND11_MODULE(model_utils_core_pybind, m) { }); m.def("flatbuffer_to_mlir", - [](py::bytes buffer, MlirContext context) -> MlirModule { + [](nb::bytes buffer, MlirContext context) -> MlirModule { mlir::DialectRegistry registry; RegisterDialects(registry); unwrap(context)->appendDialectRegistry(registry); unwrap(context)->loadAllAvailableDialects(); auto module_op = tflite::FlatBufferToMlir( - buffer, unwrap(context), mlir::UnknownLoc::get(unwrap(context))); + absl::string_view(buffer.c_str(), buffer.size()), unwrap(context), + mlir::UnknownLoc::get(unwrap(context))); return wrap(module_op.release()); }); @@ -197,7 +200,7 @@ PYBIND11_MODULE(model_utils_core_pybind, m) { std::string result; tflite::MlirToFlatBufferTranslateFunction(module_op, options, &result, true); - return py::bytes(result); + return nb::bytes(result.data(), result.size()); }); m.def("get_operation_attribute_names", [](MlirOperation c_op) { @@ -227,7 +230,7 @@ PYBIND11_MODULE(model_utils_core_pybind, m) { PyObject* np_array = Py_None; status = tensorflow::TensorToNdarray(tensor, &np_array); - return py::reinterpret_steal(np_array); + return nb::steal(np_array); }); } diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index e12fc16c56a49e..08c37384741ca4 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -247,16 +247,26 @@ bool ShouldFoldOperation(Operation* inst) { return size; }; - int64_t results_size = get_size(inst->getResultTypes()); - int64_t operands_size = get_size(inst->getOperandTypes()); + int64_t inputs_size = get_size(inst->getOperandTypes()); + int64_t outputs_size = get_size(inst->getResultTypes()); - constexpr int kSizeFactor = 2; - constexpr int64_t kResultsSizeThreshold = (1 << 19); // 64 KiB - constexpr int64_t kOperandsSizeThreshold = 
200L * 1024 * 1024 * 8; // 200 MiB + constexpr int64_t kInputsSizeThreshold = 200L * 1024 * 1024 * 8; // 200 MiB + constexpr int64_t kOutputsSizeThreshold = + 2 * kInputsSizeThreshold; // 400 MiB - return (operands_size <= kOperandsSizeThreshold) && - ((results_size <= kResultsSizeThreshold) || - (results_size <= kSizeFactor * operands_size)); + auto output_size_is_smaller_than_inputs = outputs_size <= inputs_size; + + auto inputs_and_outputs_smaller_than_arbitrary_thresholds = + (inputs_size <= kInputsSizeThreshold) && + (outputs_size <= kOutputsSizeThreshold); + + // Folding rules are: + // 1. if the size of the resulting outputs are smaller than the inputs then + // just do the fold. The model size will be smaller as a result. + // 2. if the inputs and outputs sizes are smaller than certain thresholds, do + // the fold regardless of their impact on model size. + return output_size_is_smaller_than_inputs || + inputs_and_outputs_smaller_than_arbitrary_thresholds; } // Returns dimension index for the given axis that supports negative @@ -4374,10 +4384,23 @@ OpFoldResult CastFloatToFloat(DenseFPElementsAttr data, FloatType in_type, return DenseFPElementsAttr::get(result_type, MapStaticCast(data)); } + + if (in_type.isF32() && out_type.isF16()) { + return data.mapValues(out_type, [&](const APFloat& old_value) { + APFloat value(old_value); + bool unused_loses_info; + value.convert(out_type.getFloatSemantics(), APFloat::rmNearestTiesToEven, + &unused_loses_info); + return value.bitcastToAPInt(); + }); + } return {}; } OpFoldResult CastOp::fold(FoldAdaptor adaptor) { + auto in_type = getInput().getType().getElementType(); + auto out_type = getType().getElementType(); + if (!ShouldFoldOperation(this->getOperation())) return {}; auto operands = adaptor.getOperands(); @@ -4390,9 +4413,6 @@ OpFoldResult CastOp::fold(FoldAdaptor adaptor) { auto input = operands[0]; - auto in_type = getInput().getType().getElementType(); - auto out_type = getType().getElementType(); - if 
(auto int_in_type = llvm::dyn_cast_or_null(in_type)) { auto in_data = llvm::dyn_cast_or_null(input); if (!in_data) { @@ -4962,7 +4982,7 @@ void IfOp::getSuccessorRegions(RegionBranchPoint point, SmallVectorImpl& regions) { // The `then` and the `else` region branch back to the parent operation. if (!point.isParent()) { - regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); return; } @@ -5233,6 +5253,22 @@ int64_t SoftmaxOp::GetArithmeticCount(Operation* op) { // TanhOp //===----------------------------------------------------------------------===// +OpFoldResult TanhOp::fold(FoldAdaptor adaptor) { + if (!ShouldFoldOperation(this->getOperation())) return {}; + + auto operands = adaptor.getOperands(); + Type result_type = getType(); + // Only constant fold for tensor of f32 is implemented. + if (!IsF32ShapedType(result_type)) return nullptr; + + auto compute = [](APFloat value) -> APFloat { + float f = value.convertToFloat(); + float result = std::tanh(f); + return APFloat(result); + }; + return ConstFoldUnaryOp(result_type, operands[0], compute); +} + int64_t TanhOp::GetArithmeticCount(Operation* op) { int64_t count; // As a very rough ballpark, the cost of evaluating a math function @@ -5719,6 +5755,10 @@ static FailureOr> parseI32Array(AsmParser& parser) { } // namespace TFL } // namespace mlir +using namespace mlir; // NOLINT +using mlir::TFL::ControlType; +using mlir::TFL::LSTMKernelTypeAttr; + #include "tensorflow/compiler/mlir/lite/ir/tfl_ops_dialect.cc.inc" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops_enums.cc.inc" #define GET_ATTRDEF_CLASSES diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 44370d1cfdeb96..c90859cd6accfe 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -112,6 +112,7 @@ class TFL_VariadicTensorOf allowedRuntimeTypes, Variadic>, TFL_RuntimeType>>; +def 
TFL_I2 : I<2>; def TFL_I4 : I<4>; def TFL_Int32Or64 : SignlessIntOfWidths<[32, 64]>; @@ -1099,7 +1100,7 @@ def TFL_FullyConnectedOp : TFL_Op<"fully_connected", [ let arguments = (ins TFL_TensorOf<[F32, QI8, QUI8, QI16, QUI16]>:$input, - TFL_TensorOf<[F32, QI4, QI8, QUI8, QI16]>:$filter, + TFL_TensorOf<[F32, QI2, QI4, QI8, QUI8, QI16]>:$filter, TFL_TensorOfOrNone<[F32, QI32, QUI32]>:$bias, TFL_AFAttr:$fused_activation_function, @@ -2476,13 +2477,13 @@ equivalent to setting: }]; let arguments = (ins - TFL_TensorOf<[F32, I32, I64, I8, UI8, UI32, I1, TFL_Str, QI8, QUI8, TFL_Quint8, QI16]>:$input, + TFL_TensorOf<[F32, I32, I64, QI4, I8, UI8, UI32, I1, TFL_Str, QI8, QUI8, TFL_Quint8, QI16]>:$input, TFL_I32OrI64Tensor:$begin, TFL_I32OrI64Tensor:$size ); let results = (outs - TFL_TensorOf<[F32, I32, I64, I8, UI8, UI32, I1, TFL_Str, QI8, QUI8, TFL_Quint8, QI16]>:$output + TFL_TensorOf<[F32, I32, I64, QI4, I8, UI8, UI32, I1, TFL_Str, QI8, QUI8, TFL_Quint8, QI16]>:$output ); let hasVerifier = 1; @@ -3574,6 +3575,8 @@ def TFL_TanhOp: TFL_Op<"tanh", [ /*scale=*/1.0 / (1<<(bit_width-1)), /*zero_point=*/0); } }]; + + let hasFolder = 1; } def TFL_TileOp: TFL_Op<"tile", [ @@ -4072,13 +4075,10 @@ def TFL_CastOp : TFL_Op<"cast", [ }]; let arguments = (ins - TFL_TensorOf<[F16, BF16, F32, F64, I1, TFL_I4, I16, UI16, I32, UI32, I64, TFL_Quint8, UI8, I8, Complex>]>:$input + TFL_TensorOf<[F16, BF16, F32, F64, I1, TFL_I2, TFL_I4, I16, UI16, I32, UI32, I64, TFL_Quint8, UI8, I8, Complex>]>:$input ); - // TODO(b/393644251): Temporary support for INT4 TFL_CastOp. Runtime - // probably already supports INT4. We should remove the INT4 support here or - // make sure the runtime supports is there, as part of closing the bug. 
- let results = (outs TFL_TensorOf<[F16, BF16, F32, F64, I1, TFL_I4, I16, UI16, I32, UI32, I64, TFL_Quint8, UI8, I8, Complex>]>:$output); + let results = (outs TFL_TensorOf<[F16, BF16, F32, F64, I1, TFL_I2, TFL_I4, I16, UI16, I32, UI32, I64, TFL_Quint8, UI8, I8, Complex>]>:$output); // TFLite's cast op does not utilize CastOptions, instead derives types // from the TfLiteTensors. @@ -4281,7 +4281,7 @@ def TFL_DequantizeOp: TFL_Op<"dequantize", [NoMemoryEffect]> { quantization parameters. }]; - let arguments = (ins TFL_TensorOf<[QI4, QI8, QUI8, QI16, F16]>:$input); + let arguments = (ins TFL_TensorOf<[QI2, QI4, QI8, QUI8, QI16, F16]>:$input); let results = (outs TFL_FpTensor:$output); diff --git a/tensorflow/compiler/mlir/lite/kernels/internal/runtime_shape_test.cc b/tensorflow/compiler/mlir/lite/kernels/internal/runtime_shape_test.cc index a3ae7f73b24f24..b5a3319ba13362 100644 --- a/tensorflow/compiler/mlir/lite/kernels/internal/runtime_shape_test.cc +++ b/tensorflow/compiler/mlir/lite/kernels/internal/runtime_shape_test.cc @@ -19,9 +19,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/kernels/internal/runtime_shape.h" #include -#include #include -#include #include #include diff --git a/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.h index aa700dc166e046..29ed664e7ae78f 100644 --- a/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.h +++ b/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.h @@ -31,7 +31,7 @@ namespace tensorflow { // error status if it fails to convert the input. 
absl::Status ConvertJaxToTFLiteFlatBuffer( const std::string& input, const tflite::ModelFlags& model_flags, - tflite::ConverterFlags& converter_flags, string* result); + tflite::ConverterFlags& converter_flags, std::string* result); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc index fa94cd3b5b8120..c334f24442b491 100644 --- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc @@ -140,8 +140,8 @@ absl::Status ConvertSavedModelToTFLiteFlatBuffer( mlir::TFL::QuantizationSpecs quant_specs; // Parse input arrays. - std::vector node_names; - std::vector node_dtypes; + std::vector node_names; + std::vector node_dtypes; std::vector>> node_shapes; std::vector> node_mins; std::vector> node_maxs; @@ -210,8 +210,6 @@ absl::Status ConvertSavedModelToTFLiteFlatBuffer( converter_flags.convert_to_stablehlo(); pass_config.legalize_custom_tensor_list_ops = converter_flags.legalize_custom_tensor_list_ops(); - pass_config.enable_stablehlo_quantizer = - converter_flags.has_quantization_config(); pass_config.enable_composite_direct_lowering = converter_flags.enable_composite_direct_lowering(); pass_config.model_origin_framework = converter_flags.model_origin_framework(); diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h index 33b9bacf2dfdeb..446652ccb8da05 100644 --- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h @@ -32,7 +32,7 @@ namespace tensorflow { // error status if it fails to convert the input. 
absl::Status ConvertSavedModelToTFLiteFlatBuffer( const tflite::ModelFlags& model_flags, - tflite::ConverterFlags& converter_flags, string* result, + tflite::ConverterFlags& converter_flags, std::string* result, const quantization::PyFunctionLibrary* quantization_py_function_lib); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h index f837a6f0140e7b..de75080ab5da82 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h @@ -46,8 +46,8 @@ absl::Status RegisterAllCustomOps( absl::Status PopulateQuantizationSpecs( const tflite::ModelFlags& model_flags, tflite::ConverterFlags& converter_flags, - mlir::TFL::QuantizationSpecs* quant_specs, std::vector* node_names, - std::vector* node_dtypes, + mlir::TFL::QuantizationSpecs* quant_specs, + std::vector* node_names, std::vector* node_dtypes, std::vector>>* node_shapes, std::vector>* node_mins, std::vector>* node_maxs); @@ -60,7 +60,8 @@ absl::Status ConvertMLIRToTFLiteFlatBuffer( std::unique_ptr&& context, mlir::OwningOpRef module, const mlir::TFL::PassConfig& pass_config, - const std::unordered_set& saved_model_tags, string* result, + const std::unordered_set& saved_model_tags, + std::string* result, const quantization::PyFunctionLibrary* quantization_py_function_lib); // Give a warning for any unused flags that have been specified. diff --git a/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization.td b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization.td index 26bcf0ee0022d3..1653eb8a737482 100644 --- a/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization.td +++ b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization.td @@ -56,6 +56,7 @@ class Int8UniformQuantizedType // General uniform quantized types. 
The definitions can be used to specify // operand's tensor types. +def QI2 : QuantizedType<"Uniform", [2], 1>; def QI4 : QuantizedType<"Uniform", [4], 1>; def QUI8 : QuantizedType<"Uniform", [8], 0>; def QI8 : QuantizedType<"Uniform", [8], 1>; diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc index ae3b6233f8e959..1e1f79af16cbd6 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc @@ -93,7 +93,7 @@ std::vector GetAsVector(const flatbuffers::Vector* vec) { class QuantizeWeightsTest : public testing::Test { protected: - QuantizeWeightsTest() {} + QuantizeWeightsTest() = default; void LoadBasicModel() { input_model_ = ReadTestModel(); diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/BUILD b/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/BUILD index 8d57263800e2b4..d70688fc488a6a 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/BUILD @@ -14,10 +14,7 @@ cc_library( name = "portable_tensor_utils", srcs = ["portable_tensor_utils.cc"], hdrs = ["portable_tensor_utils.h"], - visibility = [ - "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:__pkg__", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:__pkg__", - ], + visibility = ["//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:__pkg__"], ) cc_library( diff --git a/tensorflow/compiler/mlir/lite/schema/schema.fbs b/tensorflow/compiler/mlir/lite/schema/schema.fbs index d74477be913d34..6cd1c51fb0cf9e 100644 --- a/tensorflow/compiler/mlir/lite/schema/schema.fbs +++ b/tensorflow/compiler/mlir/lite/schema/schema.fbs @@ -24,6 +24,8 @@ // Version 3c: Move constant tensor buffers & custom op buffers outside from // Flatbuffers. 
Has backward compatibility with version 3, 3a and // 3b. +// Version 3d: Add ExternalBuffer tables and tensor.external_buffer field for +// referencing immutable data stored in external files. namespace tflite; @@ -59,6 +61,7 @@ enum TensorType : byte { UINT16 = 16, INT4 = 17, BFLOAT16 = 18, + INT2 = 19, } // Custom quantization parameters for experimenting with new quantization @@ -262,6 +265,11 @@ table Tensor { // Currently only 1 subtype is supported. The field is defined as an array for // flexibility of supporting multiple subtypes in the future. variant_tensors:[VariantSubType]; + + // Optional reference to an ExternalBuffer entry that stores constant tensor + // data outside of the FlatBuffer. A value of 0 indicates that the tensor uses + // the traditional embedded buffer field instead. + external_buffer:uint; } // A list of builtin operators. Builtin operators are slightly faster than custom @@ -1612,6 +1620,22 @@ table Buffer { size: ulong; } +// Groups external buffers by file/URI. +table ExternalBufferGroup { + name:string; +} + +// Describes an immutable data slice stored in an external file. +table ExternalBuffer { + // Unique identifier for this external buffer. + id:uint; + // Index into the external_buffer_groups array. + group:uint; + offset:ulong; + length:ulong; + packing:string; +} + table Metadata { // A human readable string to uniquely identify a Metadata. name:string; @@ -1679,6 +1703,12 @@ table Model { // Optional SignatureDefs for the model. signature_defs:[SignatureDef]; + + // Optional groups for external weight buffers. + external_buffer_groups:[ExternalBufferGroup]; + + // Optional list of external weight buffers referenced by tensors. 
+ external_buffers:[ExternalBuffer]; } root_type Model; diff --git a/tensorflow/compiler/mlir/lite/schema/schema_generated.h b/tensorflow/compiler/mlir/lite/schema/schema_generated.h index 43d51c40f01b8a..2b1701a8b9c0b9 100755 --- a/tensorflow/compiler/mlir/lite/schema/schema_generated.h +++ b/tensorflow/compiler/mlir/lite/schema/schema_generated.h @@ -681,6 +681,14 @@ struct Buffer; struct BufferBuilder; struct BufferT; +struct ExternalBufferGroup; +struct ExternalBufferGroupBuilder; +struct ExternalBufferGroupT; + +struct ExternalBuffer; +struct ExternalBufferBuilder; +struct ExternalBufferT; + struct Metadata; struct MetadataBuilder; struct MetadataT; @@ -717,11 +725,12 @@ enum TensorType : int8_t { TensorType_UINT16 = 16, TensorType_INT4 = 17, TensorType_BFLOAT16 = 18, + TensorType_INT2 = 19, TensorType_MIN = TensorType_FLOAT32, - TensorType_MAX = TensorType_BFLOAT16 + TensorType_MAX = TensorType_INT2 }; -inline const TensorType (&EnumValuesTensorType())[19] { +inline const TensorType (&EnumValuesTensorType())[20] { static const TensorType values[] = { TensorType_FLOAT32, TensorType_FLOAT16, @@ -741,13 +750,14 @@ inline const TensorType (&EnumValuesTensorType())[19] { TensorType_UINT32, TensorType_UINT16, TensorType_INT4, - TensorType_BFLOAT16 + TensorType_BFLOAT16, + TensorType_INT2 }; return values; } inline const char * const *EnumNamesTensorType() { - static const char * const names[20] = { + static const char * const names[21] = { "FLOAT32", "FLOAT16", "INT32", @@ -767,13 +777,14 @@ inline const char * const *EnumNamesTensorType() { "UINT16", "INT4", "BFLOAT16", + "INT2", nullptr }; return names; } inline const char *EnumNameTensorType(TensorType e) { - if (::flatbuffers::IsOutRange(e, TensorType_FLOAT32, TensorType_BFLOAT16)) return ""; + if (::flatbuffers::IsOutRange(e, TensorType_FLOAT32, TensorType_INT2)) return ""; const size_t index = static_cast(e); return EnumNamesTensorType()[index]; } @@ -5949,6 +5960,7 @@ struct TensorT : public 
::flatbuffers::NativeTable { std::vector shape_signature{}; bool has_rank = false; std::vector> variant_tensors{}; + uint32_t external_buffer = 0; TensorT() = default; TensorT(const TensorT &o); TensorT(TensorT&&) FLATBUFFERS_NOEXCEPT = default; @@ -5968,7 +5980,8 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { VT_SPARSITY = 16, VT_SHAPE_SIGNATURE = 18, VT_HAS_RANK = 20, - VT_VARIANT_TENSORS = 22 + VT_VARIANT_TENSORS = 22, + VT_EXTERNAL_BUFFER = 24 }; const ::flatbuffers::Vector *shape() const { return GetPointer *>(VT_SHAPE); @@ -6000,6 +6013,9 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { const ::flatbuffers::Vector<::flatbuffers::Offset> *variant_tensors() const { return GetPointer> *>(VT_VARIANT_TENSORS); } + uint32_t external_buffer() const { + return GetField(VT_EXTERNAL_BUFFER, 0); + } bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_SHAPE) && @@ -6019,6 +6035,7 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { VerifyOffset(verifier, VT_VARIANT_TENSORS) && verifier.VerifyVector(variant_tensors()) && verifier.VerifyVectorOfTables(variant_tensors()) && + VerifyField(verifier, VT_EXTERNAL_BUFFER, 4) && verifier.EndTable(); } TensorT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -6060,6 +6077,9 @@ struct TensorBuilder { void add_variant_tensors(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> variant_tensors) { fbb_.AddOffset(Tensor::VT_VARIANT_TENSORS, variant_tensors); } + void add_external_buffer(uint32_t external_buffer) { + fbb_.AddElement(Tensor::VT_EXTERNAL_BUFFER, external_buffer, 0); + } explicit TensorBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -6082,8 +6102,10 @@ inline ::flatbuffers::Offset CreateTensor( ::flatbuffers::Offset sparsity = 0, ::flatbuffers::Offset<::flatbuffers::Vector> shape_signature = 0, 
bool has_rank = false, - ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> variant_tensors = 0) { + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> variant_tensors = 0, + uint32_t external_buffer = 0) { TensorBuilder builder_(_fbb); + builder_.add_external_buffer(external_buffer); builder_.add_variant_tensors(variant_tensors); builder_.add_shape_signature(shape_signature); builder_.add_sparsity(sparsity); @@ -6108,7 +6130,8 @@ inline ::flatbuffers::Offset CreateTensorDirect( ::flatbuffers::Offset sparsity = 0, const std::vector *shape_signature = nullptr, bool has_rank = false, - const std::vector<::flatbuffers::Offset> *variant_tensors = nullptr) { + const std::vector<::flatbuffers::Offset> *variant_tensors = nullptr, + uint32_t external_buffer = 0) { auto shape__ = shape ? _fbb.CreateVector(*shape) : 0; auto name__ = name ? _fbb.CreateString(name) : 0; auto shape_signature__ = shape_signature ? _fbb.CreateVector(*shape_signature) : 0; @@ -6124,7 +6147,8 @@ inline ::flatbuffers::Offset CreateTensorDirect( sparsity, shape_signature__, has_rank, - variant_tensors__); + variant_tensors__, + external_buffer); } ::flatbuffers::Offset CreateTensor(::flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); @@ -16528,6 +16552,182 @@ inline ::flatbuffers::Offset CreateBufferDirect( ::flatbuffers::Offset CreateBuffer(::flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +struct ExternalBufferGroupT : public ::flatbuffers::NativeTable { + typedef ExternalBufferGroup TableType; + std::string name{}; +}; + +struct ExternalBufferGroup FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ExternalBufferGroupT NativeTableType; + typedef ExternalBufferGroupBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NAME = 4 + }; + const ::flatbuffers::String *name() 
const { + return GetPointer(VT_NAME); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_NAME) && + verifier.VerifyString(name()) && + verifier.EndTable(); + } + ExternalBufferGroupT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ExternalBufferGroupT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ExternalBufferGroupT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ExternalBufferGroupBuilder { + typedef ExternalBufferGroup Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_name(::flatbuffers::Offset<::flatbuffers::String> name) { + fbb_.AddOffset(ExternalBufferGroup::VT_NAME, name); + } + explicit ExternalBufferGroupBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateExternalBufferGroup( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::String> name = 0) { + ExternalBufferGroupBuilder builder_(_fbb); + builder_.add_name(name); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateExternalBufferGroupDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const char *name = nullptr) { + auto name__ = name ? 
_fbb.CreateString(name) : 0; + return tflite::CreateExternalBufferGroup( + _fbb, + name__); +} + +::flatbuffers::Offset CreateExternalBufferGroup(::flatbuffers::FlatBufferBuilder &_fbb, const ExternalBufferGroupT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ExternalBufferT : public ::flatbuffers::NativeTable { + typedef ExternalBuffer TableType; + uint32_t id = 0; + uint32_t group = 0; + uint64_t offset = 0; + uint64_t length = 0; + std::string packing{}; +}; + +struct ExternalBuffer FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ExternalBufferT NativeTableType; + typedef ExternalBufferBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_ID = 4, + VT_GROUP = 6, + VT_OFFSET = 8, + VT_LENGTH = 10, + VT_PACKING = 12 + }; + uint32_t id() const { + return GetField(VT_ID, 0); + } + uint32_t group() const { + return GetField(VT_GROUP, 0); + } + uint64_t offset() const { + return GetField(VT_OFFSET, 0); + } + uint64_t length() const { + return GetField(VT_LENGTH, 0); + } + const ::flatbuffers::String *packing() const { + return GetPointer(VT_PACKING); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_ID, 4) && + VerifyField(verifier, VT_GROUP, 4) && + VerifyField(verifier, VT_OFFSET, 8) && + VerifyField(verifier, VT_LENGTH, 8) && + VerifyOffset(verifier, VT_PACKING) && + verifier.VerifyString(packing()) && + verifier.EndTable(); + } + ExternalBufferT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ExternalBufferT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ExternalBufferT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ExternalBufferBuilder { + typedef ExternalBuffer Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + 
::flatbuffers::uoffset_t start_; + void add_id(uint32_t id) { + fbb_.AddElement(ExternalBuffer::VT_ID, id, 0); + } + void add_group(uint32_t group) { + fbb_.AddElement(ExternalBuffer::VT_GROUP, group, 0); + } + void add_offset(uint64_t offset) { + fbb_.AddElement(ExternalBuffer::VT_OFFSET, offset, 0); + } + void add_length(uint64_t length) { + fbb_.AddElement(ExternalBuffer::VT_LENGTH, length, 0); + } + void add_packing(::flatbuffers::Offset<::flatbuffers::String> packing) { + fbb_.AddOffset(ExternalBuffer::VT_PACKING, packing); + } + explicit ExternalBufferBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateExternalBuffer( + ::flatbuffers::FlatBufferBuilder &_fbb, + uint32_t id = 0, + uint32_t group = 0, + uint64_t offset = 0, + uint64_t length = 0, + ::flatbuffers::Offset<::flatbuffers::String> packing = 0) { + ExternalBufferBuilder builder_(_fbb); + builder_.add_length(length); + builder_.add_offset(offset); + builder_.add_packing(packing); + builder_.add_group(group); + builder_.add_id(id); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateExternalBufferDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + uint32_t id = 0, + uint32_t group = 0, + uint64_t offset = 0, + uint64_t length = 0, + const char *packing = nullptr) { + auto packing__ = packing ? 
_fbb.CreateString(packing) : 0; + return tflite::CreateExternalBuffer( + _fbb, + id, + group, + offset, + length, + packing__); +} + +::flatbuffers::Offset CreateExternalBuffer(::flatbuffers::FlatBufferBuilder &_fbb, const ExternalBufferT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + struct MetadataT : public ::flatbuffers::NativeTable { typedef Metadata TableType; std::string name{}; @@ -16799,6 +16999,8 @@ struct ModelT : public ::flatbuffers::NativeTable { std::vector metadata_buffer{}; std::vector> metadata{}; std::vector> signature_defs{}; + std::vector> external_buffer_groups{}; + std::vector> external_buffers{}; ModelT() = default; ModelT(const ModelT &o); ModelT(ModelT&&) FLATBUFFERS_NOEXCEPT = default; @@ -16816,7 +17018,9 @@ struct Model FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { VT_BUFFERS = 12, VT_METADATA_BUFFER = 14, VT_METADATA = 16, - VT_SIGNATURE_DEFS = 18 + VT_SIGNATURE_DEFS = 18, + VT_EXTERNAL_BUFFER_GROUPS = 20, + VT_EXTERNAL_BUFFERS = 22 }; uint32_t version() const { return GetField(VT_VERSION, 0); @@ -16842,6 +17046,12 @@ struct Model FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { const ::flatbuffers::Vector<::flatbuffers::Offset> *signature_defs() const { return GetPointer> *>(VT_SIGNATURE_DEFS); } + const ::flatbuffers::Vector<::flatbuffers::Offset> *external_buffer_groups() const { + return GetPointer> *>(VT_EXTERNAL_BUFFER_GROUPS); + } + const ::flatbuffers::Vector<::flatbuffers::Offset> *external_buffers() const { + return GetPointer> *>(VT_EXTERNAL_BUFFERS); + } bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyField(verifier, VT_VERSION, 4) && @@ -16864,6 +17074,12 @@ struct Model FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { VerifyOffset(verifier, VT_SIGNATURE_DEFS) && verifier.VerifyVector(signature_defs()) && verifier.VerifyVectorOfTables(signature_defs()) && + VerifyOffset(verifier, VT_EXTERNAL_BUFFER_GROUPS) && + 
verifier.VerifyVector(external_buffer_groups()) && + verifier.VerifyVectorOfTables(external_buffer_groups()) && + VerifyOffset(verifier, VT_EXTERNAL_BUFFERS) && + verifier.VerifyVector(external_buffers()) && + verifier.VerifyVectorOfTables(external_buffers()) && verifier.EndTable(); } ModelT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -16899,6 +17115,12 @@ struct ModelBuilder { void add_signature_defs(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> signature_defs) { fbb_.AddOffset(Model::VT_SIGNATURE_DEFS, signature_defs); } + void add_external_buffer_groups(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> external_buffer_groups) { + fbb_.AddOffset(Model::VT_EXTERNAL_BUFFER_GROUPS, external_buffer_groups); + } + void add_external_buffers(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> external_buffers) { + fbb_.AddOffset(Model::VT_EXTERNAL_BUFFERS, external_buffers); + } explicit ModelBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -16919,8 +17141,12 @@ inline ::flatbuffers::Offset CreateModel( ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> buffers = 0, ::flatbuffers::Offset<::flatbuffers::Vector> metadata_buffer = 0, ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> metadata = 0, - ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> signature_defs = 0) { + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> signature_defs = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> external_buffer_groups = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> external_buffers = 0) { ModelBuilder builder_(_fbb); + builder_.add_external_buffers(external_buffers); + builder_.add_external_buffer_groups(external_buffer_groups); builder_.add_signature_defs(signature_defs); builder_.add_metadata(metadata); 
builder_.add_metadata_buffer(metadata_buffer); @@ -16941,7 +17167,9 @@ inline ::flatbuffers::Offset CreateModelDirect( const std::vector<::flatbuffers::Offset> *buffers = nullptr, const std::vector *metadata_buffer = nullptr, const std::vector<::flatbuffers::Offset> *metadata = nullptr, - const std::vector<::flatbuffers::Offset> *signature_defs = nullptr) { + const std::vector<::flatbuffers::Offset> *signature_defs = nullptr, + const std::vector<::flatbuffers::Offset> *external_buffer_groups = nullptr, + const std::vector<::flatbuffers::Offset> *external_buffers = nullptr) { auto operator_codes__ = operator_codes ? _fbb.CreateVector<::flatbuffers::Offset>(*operator_codes) : 0; auto subgraphs__ = subgraphs ? _fbb.CreateVector<::flatbuffers::Offset>(*subgraphs) : 0; auto description__ = description ? _fbb.CreateString(description) : 0; @@ -16949,6 +17177,8 @@ inline ::flatbuffers::Offset CreateModelDirect( auto metadata_buffer__ = metadata_buffer ? _fbb.CreateVector(*metadata_buffer) : 0; auto metadata__ = metadata ? _fbb.CreateVector<::flatbuffers::Offset>(*metadata) : 0; auto signature_defs__ = signature_defs ? _fbb.CreateVector<::flatbuffers::Offset>(*signature_defs) : 0; + auto external_buffer_groups__ = external_buffer_groups ? _fbb.CreateVector<::flatbuffers::Offset>(*external_buffer_groups) : 0; + auto external_buffers__ = external_buffers ? 
_fbb.CreateVector<::flatbuffers::Offset>(*external_buffers) : 0; return tflite::CreateModel( _fbb, version, @@ -16958,7 +17188,9 @@ inline ::flatbuffers::Offset CreateModelDirect( buffers__, metadata_buffer__, metadata__, - signature_defs__); + signature_defs__, + external_buffer_groups__, + external_buffers__); } ::flatbuffers::Offset CreateModel(::flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); @@ -17212,7 +17444,7 @@ inline void SparsityParameters::UnPackTo(SparsityParametersT *_o, const ::flatbu (void)_resolver; { auto _e = traversal_order(); if (_e) { _o->traversal_order.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->traversal_order[_i] = _e->Get(_i); } } else { _o->traversal_order.resize(0); } } { auto _e = block_map(); if (_e) { _o->block_map.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->block_map[_i] = _e->Get(_i); } } else { _o->block_map.resize(0); } } - { auto _e = dim_metadata(); if (_e) { _o->dim_metadata.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->dim_metadata[_i]) { _e->Get(_i)->UnPackTo(_o->dim_metadata[_i].get(), _resolver); } else { _o->dim_metadata[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->dim_metadata.resize(0); } } + { auto _e = dim_metadata(); if (_e) { _o->dim_metadata.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->dim_metadata[_i]) { _e->Get(_i)->UnPackTo(_o->dim_metadata[_i].get(), _resolver); } else { _o->dim_metadata[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->dim_metadata.resize(0); } } } inline ::flatbuffers::Offset SparsityParameters::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SparsityParametersT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { @@ -17274,7 +17506,8 @@ inline TensorT::TensorT(const TensorT &o) 
is_variable(o.is_variable), sparsity((o.sparsity) ? new tflite::SparsityParametersT(*o.sparsity) : nullptr), shape_signature(o.shape_signature), - has_rank(o.has_rank) { + has_rank(o.has_rank), + external_buffer(o.external_buffer) { variant_tensors.reserve(o.variant_tensors.size()); for (const auto &variant_tensors_ : o.variant_tensors) { variant_tensors.emplace_back((variant_tensors_) ? new tflite::VariantSubTypeT(*variant_tensors_) : nullptr); } } @@ -17290,6 +17523,7 @@ inline TensorT &TensorT::operator=(TensorT o) FLATBUFFERS_NOEXCEPT { std::swap(shape_signature, o.shape_signature); std::swap(has_rank, o.has_rank); std::swap(variant_tensors, o.variant_tensors); + std::swap(external_buffer, o.external_buffer); return *this; } @@ -17311,7 +17545,8 @@ inline void Tensor::UnPackTo(TensorT *_o, const ::flatbuffers::resolver_function { auto _e = sparsity(); if (_e) { if(_o->sparsity) { _e->UnPackTo(_o->sparsity.get(), _resolver); } else { _o->sparsity = std::unique_ptr(_e->UnPack(_resolver)); } } else if (_o->sparsity) { _o->sparsity.reset(); } } { auto _e = shape_signature(); if (_e) { _o->shape_signature.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->shape_signature[_i] = _e->Get(_i); } } else { _o->shape_signature.resize(0); } } { auto _e = has_rank(); _o->has_rank = _e; } - { auto _e = variant_tensors(); if (_e) { _o->variant_tensors.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->variant_tensors[_i]) { _e->Get(_i)->UnPackTo(_o->variant_tensors[_i].get(), _resolver); } else { _o->variant_tensors[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->variant_tensors.resize(0); } } + { auto _e = variant_tensors(); if (_e) { _o->variant_tensors.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->variant_tensors[_i]) { _e->Get(_i)->UnPackTo(_o->variant_tensors[_i].get(), _resolver); } else { _o->variant_tensors[_i] = 
std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->variant_tensors.resize(0); } } + { auto _e = external_buffer(); _o->external_buffer = _e; } } inline ::flatbuffers::Offset Tensor::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const TensorT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { @@ -17332,6 +17567,7 @@ inline ::flatbuffers::Offset CreateTensor(::flatbuffers::FlatBufferBuild auto _shape_signature = _o->shape_signature.size() ? _fbb.CreateVector(_o->shape_signature) : 0; auto _has_rank = _o->has_rank; auto _variant_tensors = _o->variant_tensors.size() ? _fbb.CreateVector<::flatbuffers::Offset> (_o->variant_tensors.size(), [](size_t i, _VectorArgs *__va) { return CreateVariantSubType(*__va->__fbb, __va->__o->variant_tensors[i].get(), __va->__rehasher); }, &_va ) : 0; + auto _external_buffer = _o->external_buffer; return tflite::CreateTensor( _fbb, _shape, @@ -17343,7 +17579,8 @@ inline ::flatbuffers::Offset CreateTensor(::flatbuffers::FlatBufferBuild _sparsity, _shape_signature, _has_rank, - _variant_tensors); + _variant_tensors, + _external_buffer); } inline StablehloGatherOptionsT *StablehloGatherOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { @@ -21572,10 +21809,10 @@ inline SubGraphT *SubGraph::UnPack(const ::flatbuffers::resolver_function_t *_re inline void SubGraph::UnPackTo(SubGraphT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { (void)_o; (void)_resolver; - { auto _e = tensors(); if (_e) { _o->tensors.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->tensors[_i]) { _e->Get(_i)->UnPackTo(_o->tensors[_i].get(), _resolver); } else { _o->tensors[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->tensors.resize(0); } } + { auto _e = tensors(); if (_e) { _o->tensors.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->tensors[_i]) { _e->Get(_i)->UnPackTo(_o->tensors[_i].get(), 
_resolver); } else { _o->tensors[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->tensors.resize(0); } } { auto _e = inputs(); if (_e) { _o->inputs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->inputs[_i] = _e->Get(_i); } } else { _o->inputs.resize(0); } } { auto _e = outputs(); if (_e) { _o->outputs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->outputs[_i] = _e->Get(_i); } } else { _o->outputs.resize(0); } } - { auto _e = operators(); if (_e) { _o->operators.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->operators[_i]) { _e->Get(_i)->UnPackTo(_o->operators[_i].get(), _resolver); } else { _o->operators[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->operators.resize(0); } } + { auto _e = operators(); if (_e) { _o->operators.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->operators[_i]) { _e->Get(_i)->UnPackTo(_o->operators[_i].get(), _resolver); } else { _o->operators[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->operators.resize(0); } } { auto _e = name(); if (_e) _o->name = _e->str(); } { auto _e = debug_metadata_index(); _o->debug_metadata_index = _e; } } @@ -21637,6 +21874,70 @@ inline ::flatbuffers::Offset CreateBuffer(::flatbuffers::FlatBufferBuild _size); } +inline ExternalBufferGroupT *ExternalBufferGroup::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ExternalBufferGroupT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ExternalBufferGroup::UnPackTo(ExternalBufferGroupT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = name(); if (_e) _o->name = _e->str(); } +} + +inline ::flatbuffers::Offset ExternalBufferGroup::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const 
ExternalBufferGroupT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateExternalBufferGroup(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateExternalBufferGroup(::flatbuffers::FlatBufferBuilder &_fbb, const ExternalBufferGroupT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ExternalBufferGroupT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _name = _o->name.empty() ? 0 : _fbb.CreateString(_o->name); + return tflite::CreateExternalBufferGroup( + _fbb, + _name); +} + +inline ExternalBufferT *ExternalBuffer::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ExternalBufferT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ExternalBuffer::UnPackTo(ExternalBufferT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = id(); _o->id = _e; } + { auto _e = group(); _o->group = _e; } + { auto _e = offset(); _o->offset = _e; } + { auto _e = length(); _o->length = _e; } + { auto _e = packing(); if (_e) _o->packing = _e->str(); } +} + +inline ::flatbuffers::Offset ExternalBuffer::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ExternalBufferT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateExternalBuffer(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateExternalBuffer(::flatbuffers::FlatBufferBuilder &_fbb, const ExternalBufferT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ExternalBufferT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _id = _o->id; + auto _group = _o->group; + auto _offset = _o->offset; + auto 
_length = _o->length; + auto _packing = _o->packing.empty() ? 0 : _fbb.CreateString(_o->packing); + return tflite::CreateExternalBuffer( + _fbb, + _id, + _group, + _offset, + _length, + _packing); +} + inline MetadataT *Metadata::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { auto _o = std::unique_ptr(new MetadataT()); UnPackTo(_o.get(), _resolver); @@ -21721,8 +22022,8 @@ inline SignatureDefT *SignatureDef::UnPack(const ::flatbuffers::resolver_functio inline void SignatureDef::UnPackTo(SignatureDefT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { (void)_o; (void)_resolver; - { auto _e = inputs(); if (_e) { _o->inputs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->inputs[_i]) { _e->Get(_i)->UnPackTo(_o->inputs[_i].get(), _resolver); } else { _o->inputs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->inputs.resize(0); } } - { auto _e = outputs(); if (_e) { _o->outputs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->outputs[_i]) { _e->Get(_i)->UnPackTo(_o->outputs[_i].get(), _resolver); } else { _o->outputs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->outputs.resize(0); } } + { auto _e = inputs(); if (_e) { _o->inputs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->inputs[_i]) { _e->Get(_i)->UnPackTo(_o->inputs[_i].get(), _resolver); } else { _o->inputs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->inputs.resize(0); } } + { auto _e = outputs(); if (_e) { _o->outputs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->outputs[_i]) { _e->Get(_i)->UnPackTo(_o->outputs[_i].get(), _resolver); } else { _o->outputs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->outputs.resize(0); } } { auto _e = signature_key(); if (_e) _o->signature_key = _e->str(); } { auto _e = 
subgraph_index(); _o->subgraph_index = _e; } } @@ -21761,6 +22062,10 @@ inline ModelT::ModelT(const ModelT &o) for (const auto &metadata_ : o.metadata) { metadata.emplace_back((metadata_) ? new tflite::MetadataT(*metadata_) : nullptr); } signature_defs.reserve(o.signature_defs.size()); for (const auto &signature_defs_ : o.signature_defs) { signature_defs.emplace_back((signature_defs_) ? new tflite::SignatureDefT(*signature_defs_) : nullptr); } + external_buffer_groups.reserve(o.external_buffer_groups.size()); + for (const auto &external_buffer_groups_ : o.external_buffer_groups) { external_buffer_groups.emplace_back((external_buffer_groups_) ? new tflite::ExternalBufferGroupT(*external_buffer_groups_) : nullptr); } + external_buffers.reserve(o.external_buffers.size()); + for (const auto &external_buffers_ : o.external_buffers) { external_buffers.emplace_back((external_buffers_) ? new tflite::ExternalBufferT(*external_buffers_) : nullptr); } } inline ModelT &ModelT::operator=(ModelT o) FLATBUFFERS_NOEXCEPT { @@ -21772,6 +22077,8 @@ inline ModelT &ModelT::operator=(ModelT o) FLATBUFFERS_NOEXCEPT { std::swap(metadata_buffer, o.metadata_buffer); std::swap(metadata, o.metadata); std::swap(signature_defs, o.signature_defs); + std::swap(external_buffer_groups, o.external_buffer_groups); + std::swap(external_buffers, o.external_buffers); return *this; } @@ -21785,13 +22092,15 @@ inline void Model::UnPackTo(ModelT *_o, const ::flatbuffers::resolver_function_t (void)_o; (void)_resolver; { auto _e = version(); _o->version = _e; } - { auto _e = operator_codes(); if (_e) { _o->operator_codes.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->operator_codes[_i]) { _e->Get(_i)->UnPackTo(_o->operator_codes[_i].get(), _resolver); } else { _o->operator_codes[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->operator_codes.resize(0); } } - { auto _e = subgraphs(); if (_e) { _o->subgraphs.resize(_e->size()); for 
(::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->subgraphs[_i]) { _e->Get(_i)->UnPackTo(_o->subgraphs[_i].get(), _resolver); } else { _o->subgraphs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->subgraphs.resize(0); } } + { auto _e = operator_codes(); if (_e) { _o->operator_codes.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->operator_codes[_i]) { _e->Get(_i)->UnPackTo(_o->operator_codes[_i].get(), _resolver); } else { _o->operator_codes[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->operator_codes.resize(0); } } + { auto _e = subgraphs(); if (_e) { _o->subgraphs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->subgraphs[_i]) { _e->Get(_i)->UnPackTo(_o->subgraphs[_i].get(), _resolver); } else { _o->subgraphs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->subgraphs.resize(0); } } { auto _e = description(); if (_e) _o->description = _e->str(); } - { auto _e = buffers(); if (_e) { _o->buffers.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->buffers[_i]) { _e->Get(_i)->UnPackTo(_o->buffers[_i].get(), _resolver); } else { _o->buffers[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->buffers.resize(0); } } + { auto _e = buffers(); if (_e) { _o->buffers.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->buffers[_i]) { _e->Get(_i)->UnPackTo(_o->buffers[_i].get(), _resolver); } else { _o->buffers[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->buffers.resize(0); } } { auto _e = metadata_buffer(); if (_e) { _o->metadata_buffer.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->metadata_buffer[_i] = _e->Get(_i); } } else { _o->metadata_buffer.resize(0); } } - { auto _e = metadata(); if (_e) { _o->metadata.resize(_e->size()); for 
(::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->metadata[_i]) { _e->Get(_i)->UnPackTo(_o->metadata[_i].get(), _resolver); } else { _o->metadata[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->metadata.resize(0); } } - { auto _e = signature_defs(); if (_e) { _o->signature_defs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->signature_defs[_i]) { _e->Get(_i)->UnPackTo(_o->signature_defs[_i].get(), _resolver); } else { _o->signature_defs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->signature_defs.resize(0); } } + { auto _e = metadata(); if (_e) { _o->metadata.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->metadata[_i]) { _e->Get(_i)->UnPackTo(_o->metadata[_i].get(), _resolver); } else { _o->metadata[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->metadata.resize(0); } } + { auto _e = signature_defs(); if (_e) { _o->signature_defs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->signature_defs[_i]) { _e->Get(_i)->UnPackTo(_o->signature_defs[_i].get(), _resolver); } else { _o->signature_defs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->signature_defs.resize(0); } } + { auto _e = external_buffer_groups(); if (_e) { _o->external_buffer_groups.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->external_buffer_groups[_i]) { _e->Get(_i)->UnPackTo(_o->external_buffer_groups[_i].get(), _resolver); } else { _o->external_buffer_groups[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->external_buffer_groups.resize(0); } } + { auto _e = external_buffers(); if (_e) { _o->external_buffers.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->external_buffers[_i]) { _e->Get(_i)->UnPackTo(_o->external_buffers[_i].get(), _resolver); } else { 
_o->external_buffers[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->external_buffers.resize(0); } } } inline ::flatbuffers::Offset Model::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ModelT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { @@ -21810,6 +22119,8 @@ inline ::flatbuffers::Offset CreateModel(::flatbuffers::FlatBufferBuilder auto _metadata_buffer = _o->metadata_buffer.size() ? _fbb.CreateVector(_o->metadata_buffer) : 0; auto _metadata = _o->metadata.size() ? _fbb.CreateVector<::flatbuffers::Offset> (_o->metadata.size(), [](size_t i, _VectorArgs *__va) { return CreateMetadata(*__va->__fbb, __va->__o->metadata[i].get(), __va->__rehasher); }, &_va ) : 0; auto _signature_defs = _o->signature_defs.size() ? _fbb.CreateVector<::flatbuffers::Offset> (_o->signature_defs.size(), [](size_t i, _VectorArgs *__va) { return CreateSignatureDef(*__va->__fbb, __va->__o->signature_defs[i].get(), __va->__rehasher); }, &_va ) : 0; + auto _external_buffer_groups = _o->external_buffer_groups.size() ? _fbb.CreateVector<::flatbuffers::Offset> (_o->external_buffer_groups.size(), [](size_t i, _VectorArgs *__va) { return CreateExternalBufferGroup(*__va->__fbb, __va->__o->external_buffer_groups[i].get(), __va->__rehasher); }, &_va ) : 0; + auto _external_buffers = _o->external_buffers.size() ? 
_fbb.CreateVector<::flatbuffers::Offset> (_o->external_buffers.size(), [](size_t i, _VectorArgs *__va) { return CreateExternalBuffer(*__va->__fbb, __va->__o->external_buffers[i].get(), __va->__rehasher); }, &_va ) : 0; return tflite::CreateModel( _fbb, _version, @@ -21819,7 +22130,9 @@ inline ::flatbuffers::Offset CreateModel(::flatbuffers::FlatBufferBuilder _buffers, _metadata_buffer, _metadata, - _signature_defs); + _signature_defs, + _external_buffer_groups, + _external_buffers); } inline bool VerifyQuantizationDetails(::flatbuffers::Verifier &verifier, const void *obj, QuantizationDetails type) { diff --git a/tensorflow/compiler/mlir/lite/schema/schema_utils.cc b/tensorflow/compiler/mlir/lite/schema/schema_utils.cc index a173380940d600..cb61ce6243f3ad 100644 --- a/tensorflow/compiler/mlir/lite/schema/schema_utils.cc +++ b/tensorflow/compiler/mlir/lite/schema/schema_utils.cc @@ -15,8 +15,12 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/schema/schema_utils.h" #include +#include +#include +#include #include "tensorflow/compiler/mlir/lite/kernels/internal/compatibility_macros.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" namespace tflite { @@ -59,4 +63,51 @@ BuiltinOperator GetBuiltinCode(const OperatorCodeT* op_code) { op_code->deprecated_builtin_code)); } +size_t TensorTypeGetSize(::tflite::TensorType data_type) { + switch (data_type) { + case ::tflite::TensorType_FLOAT32: + static_assert(sizeof(float) == 4, ""); + return 4; + case ::tflite::TensorType_FLOAT16: + static_assert(sizeof(int16_t) == 2, ""); + return 2; + case ::tflite::TensorType_INT32: + static_assert(sizeof(int32_t) == 4, ""); + return 4; + case ::tflite::TensorType_UINT8: + static_assert(sizeof(uint8_t) == 1, ""); + return 1; + case ::tflite::TensorType_INT64: + static_assert(sizeof(int64_t) == 8, ""); + return 8; + case ::tflite::TensorType_BOOL: + return sizeof(bool); + case ::tflite::TensorType_INT16: + static_assert(sizeof(int16_t) == 2, 
""); + return 2; + case ::tflite::TensorType_COMPLEX64: + static_assert(sizeof(std::complex) == 8, ""); + return 8; + case ::tflite::TensorType_INT8: + static_assert(sizeof(int8_t) == 1, ""); + return 1; + case ::tflite::TensorType_FLOAT64: + static_assert(sizeof(double) == 8, ""); + return 8; + case ::tflite::TensorType_COMPLEX128: + static_assert(sizeof(std::complex) == 16, ""); + return 16; + case ::tflite::TensorType_UINT64: + static_assert(sizeof(uint64_t) == 8, ""); + return 8; + case ::tflite::TensorType_UINT32: + static_assert(sizeof(uint32_t) == 4, ""); + return 4; + case ::tflite::TensorType_UINT16: + static_assert(sizeof(uint16_t) == 2, ""); + return 2; + default: + return 0; + } +} } // namespace tflite diff --git a/tensorflow/compiler/mlir/lite/schema/schema_utils.h b/tensorflow/compiler/mlir/lite/schema/schema_utils.h index 7498aa02ebe5c2..9c32680b85117f 100644 --- a/tensorflow/compiler/mlir/lite/schema/schema_utils.h +++ b/tensorflow/compiler/mlir/lite/schema/schema_utils.h @@ -15,6 +15,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_LITE_SCHEMA_SCHEMA_UTILS_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_SCHEMA_SCHEMA_UTILS_H_ +#include + #include "flatbuffers/flatbuffers.h" #include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" @@ -28,6 +30,11 @@ BuiltinOperator GetBuiltinCode(const OperatorCode *op_code); BuiltinOperator GetBuiltinCode(const OperatorCodeT *op_code); +// Returns the size of the given TensorType in bytes, or 0 if the TensorType is +// not supported, this function should be aligned with TfLiteTypeGetSize in +// lite/kernels/kernel_util.h. 
+size_t TensorTypeGetSize(::tflite::TensorType data_type); + } // namespace tflite #endif // TENSORFLOW_COMPILER_MLIR_LITE_SCHEMA_SCHEMA_UTILS_H_ diff --git a/tensorflow/compiler/mlir/lite/stablehlo/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/BUILD index 43fada7b0d0b62..cd553040786c72 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/lite/stablehlo/BUILD @@ -539,6 +539,7 @@ cc_library( ":passes_inc_gen", ":unfold_splat_constant_pass", "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "//tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions:case", "//tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions:conv", "//tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions:custom_call", "//tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions:dot_general", diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir index ae672381bacafd..9a0a185443ebc0 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir @@ -3073,6 +3073,13 @@ func.func @convert_iota_ui64() -> tensor<123xui64> { func.return %0 : tensor<123xui64> } +// CHECK-LABEL: func @no_convert_iota_ui8 +func.func @no_convert_iota_ui8() -> tensor<123xui8> { + // CHECK: "mhlo.iota" + %0 = "mhlo.iota"() <{ iota_dimension = 0 : i64 }> : () -> tensor<123xui8> + func.return %0 : tensor<123xui8> +} + // CHECK-LABEL: func @convert_avgpool_valid( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> { // CHECK: %[[VAL_1:.*]] = "tf.AvgPool"(%[[VAL_0]]) <{data_format = "NHWC", ksize = [1, 3, 3, 1], padding = "VALID", strides = [1, 2, 2, 1]}> : (tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir 
b/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir index a77d02e78c1dce..1d8a63130ac1d9 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir @@ -3721,14 +3721,43 @@ func.func @dynamic_broadcast_in_dim_general_case_expand_back_dims(%arg0: tensor< // CHECK: %2 = "tfl.broadcast_to"(%1, %arg1) : (tensor, tensor<4xi32>) -> tensor +// ----- + +//===----------------------------------------------------------------------===// +// mhlo.case +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: case_func +func.func @case_func(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tensor) { + %0 = "mhlo.case"(%arg0) ({ + %2 = mhlo.add %arg1, %arg2 : tensor + "mhlo.return"(%2) : (tensor) -> () + }, { + %2 = mhlo.multiply %arg1, %arg1 : tensor + "mhlo.return"(%2) : (tensor) -> () + }) : (tensor) -> tensor + func.return %0: tensor +} + +// CHECK: %[[CST:.*]] = arith.constant dense<0> : tensor +// CHECK: %[[PRED:.*]] = tfl.not_equal(%arg0, %[[CST]]) : (tensor, tensor) -> tensor +// CHECK: %[[IF:.*]] = "tfl.if"(%[[PRED]]) ({ +// CHECK: %[[MUL:.*]] = tfl.mul %arg1, %arg1 {fused_activation_function = "NONE"} : tensor +// CHECK: "tfl.yield"(%[[MUL]]) : (tensor) -> () +// CHECK: }, { +// CHECK: %[[ADD:.*]] = tfl.add %arg1, %arg2 {fused_activation_function = "NONE"} : tensor +// CHECK: "tfl.yield"(%[[ADD]]) : (tensor) -> () +// CHECK: }) : (tensor) -> tensor +// CHECK: return %[[IF]] : tensor + // ----- //===----------------------------------------------------------------------===// // mhlo.if //===----------------------------------------------------------------------===// -// CHECK-LABEL: if -func.func @if(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tensor) { +// CHECK-LABEL: if_label +func.func @if_label(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tensor) { %0 = mhlo.add %arg1, %arg2 : tensor %1 = "mhlo.if"(%arg0) 
({ "mhlo.return"(%0) : (tensor) -> () diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc index 3891d0f3fe4db3..7608ff985f1eb9 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc @@ -2081,8 +2081,10 @@ class ConvertIotaOpToTfRange : public OpConversionPattern { ConversionPatternRewriter& rewriter) const final { RankedTensorType type = mlir::dyn_cast_or_null(iota_op.getType()); - // TF::RangeOp doesn't support UI16. - if (!type || type.getElementType().isUnsignedInteger(16)) return failure(); + // TF::RangeOp doesn't support UI16 and UI8. + if (!type || type.getElementType().isUnsignedInteger(16) || + type.getElementType().isUnsignedInteger(8)) + return failure(); const uint64_t dimension = iota_op.getIotaDimension(); Type element_type = type.getElementType(); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD index 9e2f1cf33f495f..16c194df28f591 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD @@ -320,6 +320,21 @@ cc_library( ], ) +cc_library( + name = "case", + srcs = ["case.cc"], + hdrs = ["case.h"], + deps = [ + ":util", + "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + "@local_xla//xla/mlir_hlo", + ], +) + cc_library( name = "if", srcs = ["if.cc"], diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/case.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/case.cc new file mode 100644 index 
00000000000000..b50a5e7fd83195 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/case.cc @@ -0,0 +1,100 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/case.h" + +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h" +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +namespace mlir::odml { +namespace { + +// Legalizes mhlo.case op to tfl.if op. +// This pattern only supports mhlo.case ops with exactly two branches. +class LegalizeCaseOp : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + mhlo::CaseOp case_op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const final { + // mhlo.case can have N branches, but tfl.if only supports two. 
+ if (case_op.getBranches().size() != 2) { + return rewriter.notifyMatchFailure( + case_op, "can only convert mhlo.case with 2 branches"); + } + + // `mhlo.case` takes an index, `tfl.if` takes a boolean predicate. + // For a 2-branch `mhlo.case` (branch 0 and branch 1), we need to map + // the index to a boolean. + // According to the mhlo.case spec, an out-of-bounds index defaults to the + // index of the last branch, which is 1 in this case. + // So, index 0 maps to branch 0, and any other index (1, or out of bounds) + // maps to branch 1. + // This can be expressed as a predicate `index != 0` for branch 1. + + auto loc = case_op->getLoc(); + auto index = case_op.getIndex(); + auto index_type = mlir::cast(index.getType()); + + // Create a constant tensor of the same shape as the index, filled with + // zeros. + auto const_zero = arith::ConstantOp::create( + rewriter, loc, rewriter.getZeroAttr(index_type)); + + // Create the predicate `index != 0`. + auto pred_type = index_type.clone(rewriter.getI1Type()); + auto pred = mhlo::CompareOp::create( + rewriter, loc, pred_type, index, const_zero, + mhlo::ComparisonDirectionAttr::get(rewriter.getContext(), + mhlo::ComparisonDirection::NE), + mhlo::ComparisonTypeAttr{}); // Default comparison type is fine for + // integers. + + // Create the tfl.if op. + auto tfl_if = + TFL::IfOp::create(rewriter, loc, case_op.getResultTypes(), pred); + + // Branch 1 of mhlo.case becomes the `then_region` of tfl.if. + tfl_if.getThenRegion().takeBody(case_op.getBranches()[1]); + ReplaceTerminatorWithYield(tfl_if.getThenRegion(), rewriter); + + // Branch 0 of mhlo.case becomes the `else_region` of tfl.if. 
+ tfl_if.getElseRegion().takeBody(case_op.getBranches()[0]); + ReplaceTerminatorWithYield(tfl_if.getElseRegion(), rewriter); + + rewriter.replaceOp(case_op, tfl_if.getResults()); + return success(); + } +}; + +} // namespace + +void PopulateCasePatterns(MLIRContext* context, RewritePatternSet& patterns, + ConversionTarget& target) { + patterns.add(context); + // Mark mhlo.case as dynamically legal: it's legal if it does NOT have + // exactly 2 branches, as those are the ones we want to convert. + target.addDynamicallyLegalOp( + [](mhlo::CaseOp op) { return op.getBranches().size() != 2; }); +} + +} // namespace mlir::odml diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/case.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/case.h new file mode 100644 index 00000000000000..11c470a1492630 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/case.h @@ -0,0 +1,31 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_CASE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_CASE_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project + +namespace mlir { +namespace odml { + +void PopulateCasePatterns(MLIRContext* context, RewritePatternSet& patterns, + ConversionTarget& target); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_CASE_H_ diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc index 9518b960f17442..0c43a5c4047a64 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc @@ -38,6 +38,7 @@ limitations under the License. 
#include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/case.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/conv.h" // IWYU pragma: keep #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/dot_general.h" // IWYU pragma: keep @@ -479,6 +480,7 @@ void LegalizeHloToTfLitePass::runOnOperation() { PopulateWhilePatterns(context, patterns, target); PopulateGetDimensionSizePatterns(context, patterns, target); PopulateIfPatterns(context, patterns, target); + PopulateCasePatterns(context, patterns, target); PopulateLegalizeFftPatterns(context, patterns, target); PopulateCustomCallPatterns(context, patterns, target); @@ -493,7 +495,6 @@ void LegalizeHloToTfLitePass::runOnOperation() { } // namespace - // Creates an instance of the pass. 
std::unique_ptr> CreateLegalizeHloToTfLitePass() { return std::make_unique(); diff --git a/tensorflow/compiler/mlir/lite/tests/const-fold.mlir b/tensorflow/compiler/mlir/lite/tests/const-fold.mlir index 6043e26cb757d8..5bc6bef17fe360 100644 --- a/tensorflow/compiler/mlir/lite/tests/const-fold.mlir +++ b/tensorflow/compiler/mlir/lite/tests/const-fold.mlir @@ -261,7 +261,7 @@ func.func @mul_one_quant(%arg0: tensor<32x!quant.uniform>) -> tenso // CHECK-LABEL: @elementwise_unary_ops -func.func @elementwise_unary_ops() -> (tensor, tensor, tensor, tensor, tensor, tensor, tensor) { +func.func @elementwise_unary_ops() -> (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) { %0 = arith.constant dense<-1.0> : tensor %1 = arith.constant dense<1.0> : tensor %2 = arith.constant dense<1.0> : tensor @@ -269,6 +269,7 @@ func.func @elementwise_unary_ops() -> (tensor, tensor, tensor, te %4 = arith.constant dense<4.0> : tensor %5 = arith.constant dense<4.0> : tensor %6 = arith.constant dense<2.0> : tensor + %one = arith.constant dense<1.0> : tensor // CHECK-DAG: [[cst0:%.*]] = arith.constant dense<1.000000e+00> : tensor // CHECK-DAG: [[cst1:%.*]] = arith.constant dense<0.841470957> : tensor @@ -277,7 +278,8 @@ func.func @elementwise_unary_ops() -> (tensor, tensor, tensor, te // CHECK-DAG: [[cst4:%.*]] = arith.constant dense<2.000000e+00> : tensor // CHECK-DAG: [[cst5:%.*]] = arith.constant dense<5.000000e-01> : tensor // CHECK-DAG: [[cst6:%.*]] = arith.constant dense<4.000000e+00> : tensor - // CHECK: return [[cst0]], [[cst1]], [[cst2]], [[cst3]], [[cst4]], [[cst5]], [[cst6]] + // CHECK-DAG: [[cst7:%.*]] = arith.constant dense<0.761594176> : tensor + // CHECK: return [[cst0]], [[cst1]], [[cst2]], [[cst3]], [[cst4]], [[cst5]], [[cst6]], [[cst7]] %7 = "tfl.abs"(%0) : (tensor) -> tensor %8 = "tfl.sin"(%1) : (tensor) -> tensor @@ -286,8 +288,9 @@ func.func @elementwise_unary_ops() -> (tensor, tensor, tensor, te %11 = "tfl.sqrt"(%4) : (tensor) -> tensor %12 = "tfl.rsqrt"(%5) 
: (tensor) -> tensor %13 = "tfl.square"(%6) : (tensor) -> tensor + %14 = "tfl.tanh"(%one) : (tensor) -> tensor - func.return %7, %8, %9, %10, %11, %12, %13 : tensor, tensor, tensor, tensor, tensor, tensor, tensor + func.return %7, %8, %9, %10, %11, %12, %13, %14 : tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor } // CHECK-LABEL: @max_with_neg_f32_max_val @@ -1126,6 +1129,15 @@ func.func @cast_f32_to_f64() -> tensor<4xf64> { // CHECK: %cst = arith.constant dense<[-1.000000e+00, 0.000000e+00, 1.500000e+00, 1.000000e+02]> : tensor<4xf64> +// CHECK-LABEL: @cast_f32_to_f16 +func.func @cast_f32_to_f16() -> tensor<4xf16> { + %cst = arith.constant dense<[-1.0, 0.0, 1.5, 100.0]> : tensor<4xf32> + %0 = "tfl.cast"(%cst) : (tensor<4xf32>) -> tensor<4xf16> + func.return %0 : tensor<4xf16> +} + +// CHECK: %cst = arith.constant dense<[-1.000000e+00, 0.000000e+00, 1.500000e+00, 1.000000e+02]> : tensor<4xf16> + // CHECK-LABEL: @ConstantFoldFullyConnectedSmall func.func @ConstantFoldFullyConnectedSmall() -> tensor<3xf32> { %cst_input = arith.constant dense<[2.0, 3.0]> : tensor<2xf32> diff --git a/tensorflow/compiler/mlir/lite/tests/lower_quant_annotations.mlir b/tensorflow/compiler/mlir/lite/tests/lower_quant_annotations.mlir index 915db8f6867550..62434d956f8609 100644 --- a/tensorflow/compiler/mlir/lite/tests/lower_quant_annotations.mlir +++ b/tensorflow/compiler/mlir/lite/tests/lower_quant_annotations.mlir @@ -3,6 +3,8 @@ func.func private @XlaCallModule_quant.fake_quant.impl_0(tensor<1x28x28x3xf32>) -> tensor<1x28x28x3xf32> func.func private @XlaCallModule_quant.fake_quant.impl_5_0(tensor<2x1x1x1xf32>) -> tensor<2x1x1x1xf32> func.func private @XlaCallModule_quant.fake_quant.impl_17_0(tensor<1x30x30x2xf32>) -> tensor<1x30x30x2xf32> +func.func private @XlaCallModule_quant.fake_quant.impl_i2_0(tensor<1x4xf32>) -> tensor<1x4xf32> +func.func private @XlaCallModule_quant.fake_quant.impl_i2_1(tensor<1x4xf32>) -> tensor<1x4xf32> // CHECK-LABEL: func.func @serving_default 
func.func @serving_default(%arg0: tensor<1x28x28x3xf32>) -> (tensor<1x30x30x2xf32>) { %cst = arith.constant dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi32> @@ -22,4 +24,15 @@ func.func @serving_default(%arg0: tensor<1x28x28x3xf32>) -> (tensor<1x30x30x2xf3 // CHECK-OFF: %[[DEQUANT2:.+]] = "tfl.dequantize"(%[[QUANT2]]) : (tensor<1x30x30x2x!quant.uniform>) -> tensor<1x30x30x2xf32> %5 = stablehlo.composite "quant.fake_quant" %4 {composite_attributes = {dtype = "i8", narrow_range = false, scale = dense<0.0180494692> : tensor<1xf32>, zero_point = dense<8> : tensor<1xi32>}, decomposition = @XlaCallModule_quant.fake_quant.impl_17_0} : (tensor<1x30x30x2xf32>) -> tensor<1x30x30x2xf32> return %5 : tensor<1x30x30x2xf32> +} + +// CHECK-LABEL: func.func @i2_test +func.func @i2_test(%arg0: tensor<1x4xf32>) -> (tensor<1x4xf32>) { + // CHECK: %[[QUANT0:.+]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x4x!quant.uniform>}> : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> + // CHECK: %[[DEQUANT0:.+]] = "tfl.dequantize"(%[[QUANT0]]) : (tensor<1x4x!quant.uniform>) -> tensor<1x4xf32> + %0 = stablehlo.composite "quant.fake_quant" %arg0 {composite_attributes = {dtype = "i2", narrow_range = false, scale = dense<1.0> : tensor<1xf32>, zero_point = dense<0> : tensor<1xi32>}, decomposition = @XlaCallModule_quant.fake_quant.impl_i2_0} : (tensor<1x4xf32>) -> tensor<1x4xf32> + // CHECK: %[[QUANT1:.+]] = "tfl.quantize"(%[[DEQUANT0]]) <{qtype = tensor<1x4x!quant.uniform:f32:1, {1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00}>>}> : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform:f32:1, {1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00}>> + // CHECK: %[[DEQUANT1:.+]] = "tfl.dequantize"(%[[QUANT1]]) : (tensor<1x4x!quant.uniform:f32:1, {1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00}>>) -> tensor<1x4xf32> + %1 = stablehlo.composite "quant.fake_quant" %0 {composite_attributes = {dtype = "i2", narrow_range = true, quantization_dimension = 1 : i32, scale = dense<[1.0, 2.0, 3.0, 4.0]> : 
tensor<4xf32>}, decomposition = @XlaCallModule_quant.fake_quant.impl_i2_1} : (tensor<1x4xf32>) -> tensor<1x4xf32> + return %1 : tensor<1x4xf32> } \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/optimize.mlir b/tensorflow/compiler/mlir/lite/tests/optimize.mlir index 035f210a73a7f4..063e25944da6fe 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize.mlir @@ -4802,23 +4802,6 @@ func.func @RealDivWithConstDivisor(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { // CHECK: return %0 : tensor<2x3xf32> } -// When the const tensor cst is very large, `1 / cst` div introduced by -// div->mul conversion may not be folded and the `1 / cst` div may trigger -// the div->mul conversion again. -// This test checks the div->mul conversion will not be done infinitively. -// -// CHECK-LABEL: @RealDivWithLargeSizeConstDivisor -func.func @RealDivWithLargeSizeConstDivisor(%arg0: tensor<1x16x4096x4096xf32>) -> tensor<1x16x4096x4096xf32> { - %cst = arith.constant dense<5.000000e+01> : tensor<1x16x4096x4096xf32> - %1 = tfl.div %arg0, %cst {fused_activation_function = "NONE"} : tensor<1x16x4096x4096xf32> - func.return %1 : tensor<1x16x4096x4096xf32> - // CHECK-NEXT: %[[CST0:.*]] = arith.constant dense<1.000000e+00> : tensor - // CHECK-NEXT: %[[CST1:.*]] = arith.constant dense<5.000000e+01> : tensor<1x16x4096x4096xf32> - // CHECK-NEXT: %[[DIV:.*]] = tfl.div(%[[CST0]], %[[CST1]]) <{fused_activation_function = "NONE"}> : (tensor, tensor<1x16x4096x4096xf32>) -> tensor<1x16x4096x4096xf32> - // CHECK-NEXT: %[[MUL:.*]] = tfl.mul %arg0, %[[DIV]] {fused_activation_function = "NONE"} : tensor<1x16x4096x4096xf32> - // CHECK-NEXT: return %[[MUL]] : tensor<1x16x4096x4096xf32> -} - //CHECK-LABEL: @PushTransposeThroughSqueezeNoDims func.func @PushTransposeThroughSqueezeNoDims(%arg0: tensor<1x1x2x3xf32>) -> (tensor<3x2xf32>) { %cst = arith.constant dense<[0, 3, 1, 2]> : tensor<4xi32> diff --git 
a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc index e950a5d91b9876..2ce933112a0a43 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc @@ -323,21 +323,19 @@ absl::Status ConvertTFExecutorToStablehloFlatbuffer( // TODO: b/264218457 - Refactor the component below once StableHLO Quantizer // can run DRQ. Temporarily using TF Quantization for StableHLO DRQ. - if (!converter_flags.has_quantization_options()) { - // The default minimum number of elements a weights array must have to be - // quantized by this transformation. - const int kWeightsMinNumElementsDefault = 1024; - - quantization::QuantizationOptions quantization_options; - - quantization_options.mutable_quantization_method()->set_preset_method( - quantization::QuantizationMethod::METHOD_DYNAMIC_RANGE_INT8); - quantization_options.set_op_set(quantization::UNIFORM_QUANTIZED); - quantization_options.set_min_num_elements_for_weights( - kWeightsMinNumElementsDefault); - quantization::AddQuantizePtqDynamicRangePasses(pass_manager, - quantization_options); - } + // The default minimum number of elements a weights array must have to be + // quantized by this transformation. 
+ const int kWeightsMinNumElementsDefault = 1024; + + quantization::QuantizationOptions quantization_options; + + quantization_options.mutable_quantization_method()->set_preset_method( + quantization::QuantizationMethod::METHOD_DYNAMIC_RANGE_INT8); + quantization_options.set_op_set(quantization::UNIFORM_QUANTIZED); + quantization_options.set_min_num_elements_for_weights( + kWeightsMinNumElementsDefault); + quantization::AddQuantizePtqDynamicRangePasses(pass_manager, + quantization_options); if (failed(pass_manager.run(module))) { return status_handler.ConsumeStatus(); } @@ -350,10 +348,6 @@ absl::Status ConvertTFExecutorToStablehloFlatbuffer( pass_manager.addPass(mlir::odml::createPrintOpStatsPass( mlir::odml::GetAcceptedStableHLODialects())); mlir::odml::AddStablehloOptimizationPasses(pass_manager); - if (converter_flags.has_quantization_options()) { - stablehlo::quantization::AddQuantizationPasses( - pass_manager, converter_flags.quantization_options()); - } if (failed(pass_manager.run(module))) { return status_handler.ConsumeStatus(); } diff --git a/tensorflow/compiler/mlir/lite/tools/versioning/op_signature.cc b/tensorflow/compiler/mlir/lite/tools/versioning/op_signature.cc index f8438cd2231ad5..77b8fed82ab939 100644 --- a/tensorflow/compiler/mlir/lite/tools/versioning/op_signature.cc +++ b/tensorflow/compiler/mlir/lite/tools/versioning/op_signature.cc @@ -73,7 +73,7 @@ std::vector GetOpSignatureTensorSpecs( // Check if the tensor is a constant tensor. 
if (buffer_idx != 0 && buffer_idx < model->buffers()->size()) { auto* buffer = model->buffers()->Get(buffer_idx); - if (buffer->data() && buffer->data()->size() != 0) { + if (buffer->data() && !buffer->data()->empty()) { tensor_spec.is_const = true; } } @@ -143,8 +143,8 @@ OpSignature GetOpSignature(const OperatorCode* op_code, const Operator* op, const QuantizationParameters* weight_quant = weight_tensor->quantization(); if (weight_quant && weight_quant->scale() && - weight_quant->scale()->size() && weight_tensor->shape() && - weight_tensor->shape()->size()) { + !weight_quant->scale()->empty() && weight_tensor->shape() && + !weight_tensor->shape()->empty()) { op_sig.ext_options.fully_connected.is_per_channel_quantized = IsTensorSizeEqual(weight_quant->scale()->size(), weight_tensor->shape()->Get(0)); @@ -152,7 +152,7 @@ OpSignature GetOpSignature(const OperatorCode* op_code, const Operator* op, } break; case BuiltinOperator_MUL: { - if (op->inputs()->size() < 2 || op->outputs()->size() < 1) { + if (op->inputs()->size() < 2 || op->outputs()->empty()) { break; } const Tensor* input1_tensor = @@ -167,10 +167,10 @@ OpSignature GetOpSignature(const OperatorCode* op_code, const Operator* op, const QuantizationParameters* output_quant = output_tensor->quantization(); if (input1_quant && input1_quant->scale() && - input1_quant->scale()->size() && input2_qunt && - input2_qunt->scale() && input2_qunt->scale()->size() && + !input1_quant->scale()->empty() && input2_qunt && + input2_qunt->scale() && !input2_qunt->scale()->empty() && output_quant && output_quant->scale() && - output_quant->scale()->size()) { + !output_quant->scale()->empty()) { op_sig.ext_options.mul.input1_scale = input1_quant->scale()->Get(0); op_sig.ext_options.mul.input2_scale = input2_qunt->scale()->Get(0); op_sig.ext_options.mul.output_scale = output_quant->scale()->Get(0); @@ -192,7 +192,7 @@ OpSignature GetOpSignature(const OperatorCode* op_code, const Operator* op, filter_quant->scale()->size() == 
static_cast(num_filters)) { op_sig.ext_options.conv_2d.is_per_channel_quantized = true; } - if (input_tensor->shape() && input_tensor->shape()->size()) { + if (input_tensor->shape() && !input_tensor->shape()->empty()) { int num_input_channels = input_tensor->shape()->Get(3); int num_filter_input_channels = filter_tensor->shape()->Get(3); op_sig.ext_options.conv_2d.is_grouped_convolution = @@ -249,8 +249,9 @@ OpSignature GetOpSignature(const OperatorCode* op_code, const Operator* op, const Tensor* table_tensor = subgraph->tensors()->Get(op->inputs()->Get(1)); const QuantizationParameters* table_quant = table_tensor->quantization(); - if (table_quant && table_quant->scale() && table_quant->scale()->size() && - table_tensor->shape() && table_tensor->shape()->size()) { + if (table_quant && table_quant->scale() && + !table_quant->scale()->empty() && table_tensor->shape() && + !table_tensor->shape()->empty()) { op_sig.ext_options.embedding_lookup.is_per_channel_quantized = table_quant->scale()->size() > 1 && IsTensorSizeEqual(table_quant->scale()->size(), diff --git a/tensorflow/compiler/mlir/lite/tools/versioning/op_version.cc b/tensorflow/compiler/mlir/lite/tools/versioning/op_version.cc index 30c564c41c503c..9ccda1d0c95e69 100644 --- a/tensorflow/compiler/mlir/lite/tools/versioning/op_version.cc +++ b/tensorflow/compiler/mlir/lite/tools/versioning/op_version.cc @@ -177,6 +177,10 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { reinterpret_cast(op_sig.builtin_data); TFLITE_DCHECK(fully_connected_params != nullptr); + if (op_sig.inputs.at(1).type == kTfLiteInt2) { + return 14; + } + if (op_sig.inputs.at(0).type == kTfLiteInt16 && op_sig.inputs.at(1).type == kTfLiteInt4 && op_sig.outputs.at(0).type == kTfLiteInt16) { @@ -464,6 +468,9 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { return 1; case BuiltinOperator_SLICE: + if (op_sig.inputs.at(0).type == kTfLiteInt4) { + return 7; + } if (op_sig.inputs.at(0).type == kTfLiteUInt32) { return 6; } @@ 
-473,7 +480,6 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { if (op_sig.inputs.at(0).type == kTfLiteInt16) { return 4; } - // Version 3 supports string input types. if (op_sig.inputs.at(0).type == kTfLiteString) { return 3; } @@ -499,6 +505,9 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { return 1; case BuiltinOperator_DEQUANTIZE: + if (op_sig.inputs.at(0).type == kTfLiteInt2) { + return 7; + } if (op_sig.inputs.at(0).type == kTfLiteInt4) { return 6; } @@ -1073,8 +1082,11 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { } return 2; case BuiltinOperator_CAST: - if (op_sig.inputs.at(0).type == kTfLiteBFloat16 || - op_sig.outputs.at(0).type == kTfLiteBFloat16) { + if (op_sig.inputs.at(0).type == kTfLiteInt2 || + op_sig.outputs.at(0).type == kTfLiteInt2) { + return 8; + } else if (op_sig.inputs.at(0).type == kTfLiteBFloat16 || + op_sig.outputs.at(0).type == kTfLiteBFloat16) { return 7; } else if (op_sig.inputs.at(0).type == kTfLiteInt4 && op_sig.outputs.at(0).type == kTfLiteFloat32) { diff --git a/tensorflow/compiler/mlir/lite/tools/versioning/op_version_test.cc b/tensorflow/compiler/mlir/lite/tools/versioning/op_version_test.cc index aaf335682a0358..87313665d1811f 100644 --- a/tensorflow/compiler/mlir/lite/tools/versioning/op_version_test.cc +++ b/tensorflow/compiler/mlir/lite/tools/versioning/op_version_test.cc @@ -733,6 +733,15 @@ TEST(OpVersionTest, VersioningFullyConnectedTest) { }; fake_op_sig.ext_options.fully_connected.is_per_channel_quantized = true; EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 12); + + fake_op_sig = { + .op = BuiltinOperator_FULLY_CONNECTED, + .inputs = CreateOpSignatureTensorSpecs( + std::vector{kTfLiteInt8, kTfLiteInt2}), + .outputs = CreateOpSignatureTensorSpecs(kTfLiteInt8), + .builtin_data = reinterpret_cast(&fully_connected_params), + }; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 14); } TEST(OpVersionTest, VersioningDequantizeTest) { @@ -757,6 +766,12 @@ TEST(OpVersionTest, 
VersioningDequantizeTest) { fake_op_sig.ext_options.dequantize.is_per_channel_quantized = true; EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 5); + fake_op_sig = { + .op = BuiltinOperator_DEQUANTIZE, + .inputs = CreateOpSignatureTensorSpecs(kTfLiteInt2), + }; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 7); + fake_op_sig = { .op = BuiltinOperator_DEQUANTIZE, .inputs = CreateOpSignatureTensorSpecs(kTfLiteFloat32), @@ -1467,4 +1482,72 @@ TEST(OpVersionTest, VersioningSqrtTest) { fake_op_sig.inputs = CreateOpSignatureTensorSpecs(kTfLiteInt16); EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 2); } + +TEST(OpVersionTest, VersioningCastTest) { + OpSignature fake_op_sig = {}; + fake_op_sig.op = BuiltinOperator_CAST; + fake_op_sig.inputs = CreateOpSignatureTensorSpecs(kTfLiteInt2); + fake_op_sig.outputs = CreateOpSignatureTensorSpecs(kTfLiteInt32); + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 8); + + fake_op_sig.inputs = CreateOpSignatureTensorSpecs(kTfLiteInt32); + fake_op_sig.outputs = CreateOpSignatureTensorSpecs(kTfLiteInt2); + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 8); + + fake_op_sig.inputs = CreateOpSignatureTensorSpecs(kTfLiteBFloat16); + fake_op_sig.outputs = CreateOpSignatureTensorSpecs(kTfLiteInt32); + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 7); + + fake_op_sig.inputs = CreateOpSignatureTensorSpecs(kTfLiteInt32); + fake_op_sig.outputs = CreateOpSignatureTensorSpecs(kTfLiteBFloat16); + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 7); + + fake_op_sig.inputs = CreateOpSignatureTensorSpecs(kTfLiteInt4); + fake_op_sig.outputs = CreateOpSignatureTensorSpecs(kTfLiteFloat32); + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 6); + + fake_op_sig.inputs = CreateOpSignatureTensorSpecs(kTfLiteFloat64); + fake_op_sig.outputs = CreateOpSignatureTensorSpecs(kTfLiteInt32); + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 5); + + fake_op_sig.inputs = CreateOpSignatureTensorSpecs(kTfLiteInt32); + fake_op_sig.outputs = 
CreateOpSignatureTensorSpecs(kTfLiteFloat64); + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 5); + + fake_op_sig.inputs = CreateOpSignatureTensorSpecs(kTfLiteFloat16); + fake_op_sig.outputs = CreateOpSignatureTensorSpecs(kTfLiteInt32); + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 5); + + fake_op_sig.inputs = CreateOpSignatureTensorSpecs(kTfLiteInt32); + fake_op_sig.outputs = CreateOpSignatureTensorSpecs(kTfLiteFloat16); + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 5); + + fake_op_sig.inputs = CreateOpSignatureTensorSpecs(kTfLiteUInt16); + fake_op_sig.outputs = CreateOpSignatureTensorSpecs(kTfLiteInt32); + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 4); + + fake_op_sig.inputs = CreateOpSignatureTensorSpecs(kTfLiteInt32); + fake_op_sig.outputs = CreateOpSignatureTensorSpecs(kTfLiteUInt16); + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 4); + + fake_op_sig.inputs = CreateOpSignatureTensorSpecs(kTfLiteInt8); + fake_op_sig.outputs = CreateOpSignatureTensorSpecs(kTfLiteInt32); + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 3); + + fake_op_sig.inputs = CreateOpSignatureTensorSpecs(kTfLiteInt32); + fake_op_sig.outputs = CreateOpSignatureTensorSpecs(kTfLiteInt8); + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 3); + + fake_op_sig.inputs = CreateOpSignatureTensorSpecs(kTfLiteUInt32); + fake_op_sig.outputs = CreateOpSignatureTensorSpecs(kTfLiteInt32); + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 2); + + fake_op_sig.inputs = CreateOpSignatureTensorSpecs(kTfLiteInt32); + fake_op_sig.outputs = CreateOpSignatureTensorSpecs(kTfLiteUInt32); + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 2); + + fake_op_sig.inputs = CreateOpSignatureTensorSpecs(kTfLiteInt32); + fake_op_sig.outputs = CreateOpSignatureTensorSpecs(kTfLiteInt32); + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 1); +} } // namespace tflite diff --git a/tensorflow/compiler/mlir/lite/tools/versioning/runtime_version.cc 
b/tensorflow/compiler/mlir/lite/tools/versioning/runtime_version.cc index 4f4dc835c91d6c..aca1b463878966 100644 --- a/tensorflow/compiler/mlir/lite/tools/versioning/runtime_version.cc +++ b/tensorflow/compiler/mlir/lite/tools/versioning/runtime_version.cc @@ -112,6 +112,7 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code, {{BuiltinOperator_CAST, 5}, "2.12.0"}, {{BuiltinOperator_CAST, 6}, "2.15.0"}, {{BuiltinOperator_CAST, 7}, "2.17.0"}, + {{BuiltinOperator_CAST, 8}, "2.21.0"}, {{BuiltinOperator_CONCATENATION, 1}, "1.5.0"}, {{BuiltinOperator_CONCATENATION, 2}, "1.14.0"}, {{BuiltinOperator_CONCATENATION, 3}, "2.3.0"}, @@ -138,6 +139,7 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code, {{BuiltinOperator_FULLY_CONNECTED, 11}, "2.15.0"}, {{BuiltinOperator_FULLY_CONNECTED, 12}, "2.17.0"}, {{BuiltinOperator_FULLY_CONNECTED, 13}, "2.18.0"}, + {{BuiltinOperator_FULLY_CONNECTED, 14}, "2.21.0"}, {{BuiltinOperator_GATHER, 1}, "1.6.0"}, {{BuiltinOperator_GATHER, 2}, "1.14.0"}, {{BuiltinOperator_GATHER, 3}, "1.15.0"}, @@ -293,6 +295,7 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code, {{BuiltinOperator_SLICE, 4}, "2.4.0"}, {{BuiltinOperator_SLICE, 5}, "2.5.0"}, {{BuiltinOperator_SLICE, 6}, "2.14.0"}, + {{BuiltinOperator_SLICE, 7}, "2.21.0"}, {{BuiltinOperator_TANH, 1}, "1.14.0"}, {{BuiltinOperator_TANH, 2}, "1.14.0"}, {{BuiltinOperator_TANH, 3}, "2.3.0"}, @@ -325,6 +328,7 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code, {{BuiltinOperator_DEQUANTIZE, 4}, "2.2.0"}, {{BuiltinOperator_DEQUANTIZE, 5}, "2.7.0"}, {{BuiltinOperator_DEQUANTIZE, 6}, "2.18.0"}, + {{BuiltinOperator_DEQUANTIZE, 7}, "2.21.0"}, {{BuiltinOperator_REVERSE_SEQUENCE, 1}, "1.14.0"}, {{BuiltinOperator_EQUAL, 1}, "1.14.0"}, {{BuiltinOperator_EQUAL, 2}, "1.14.0"}, diff --git a/tensorflow/compiler/mlir/lite/transforms/decompose_hybrid_quantization.cc 
b/tensorflow/compiler/mlir/lite/transforms/decompose_hybrid_quantization.cc index 4886f09dd5c4bc..6b92b5f63ee66f 100644 --- a/tensorflow/compiler/mlir/lite/transforms/decompose_hybrid_quantization.cc +++ b/tensorflow/compiler/mlir/lite/transforms/decompose_hybrid_quantization.cc @@ -49,7 +49,7 @@ class DecomposeHybridQuantizationPass : public impl::DecomposeHybridQuantizationPassBase< DecomposeHybridQuantizationPass> { public: - explicit DecomposeHybridQuantizationPass() {} + explicit DecomposeHybridQuantizationPass() = default; void runOnOperation() override; }; diff --git a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc index 77b112d28a5098..0564dd56961b35 100644 --- a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc +++ b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc @@ -54,7 +54,7 @@ namespace { class DefaultQuantParamsPass : public impl::DefaultQuantParamsPassBase { public: - DefaultQuantParamsPass() {} + DefaultQuantParamsPass() = default; explicit DefaultQuantParamsPass(double default_min, double default_max, bool is_signed) { diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_quant_annotations_helper.cc b/tensorflow/compiler/mlir/lite/transforms/lower_quant_annotations_helper.cc index 2959f6764354d0..6caa2107799844 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lower_quant_annotations_helper.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lower_quant_annotations_helper.cc @@ -71,12 +71,15 @@ LogicalResult FillCompositeParams(stablehlo::CompositeOp op, return failure(); } std::string dtype = dtype_attr.getValue().str(); - if (dtype == "i8") { - num_bits = 8; + if (dtype == "i2") { + num_bits = 2; is_signed = true; } else if (dtype == "i4") { num_bits = 4; is_signed = true; + } else if (dtype == "i8") { + num_bits = 8; + is_signed = true; } else { return failure(); } @@ -110,7 +113,16 @@ LogicalResult 
GetStorageParams(unsigned num_bits, bool narrow_range, bool is_signed, MLIRContext* ctx, Type& storage_type, int64_t& qmin, int64_t& qmax) { - if (num_bits <= 4) { + if (num_bits == 2) { + storage_type = IntegerType::get(ctx, 2); + if (is_signed) { + qmin = -2; + qmax = 1; + } else { + qmin = 0; + qmax = 3; + } + } else if (num_bits <= 4) { storage_type = IntegerType::get(ctx, 4); if (is_signed) { qmin = -8; diff --git a/tensorflow/compiler/mlir/lite/transforms/modify_io_nodes.cc b/tensorflow/compiler/mlir/lite/transforms/modify_io_nodes.cc index feec5f23ca015b..29bb4e7134b598 100644 --- a/tensorflow/compiler/mlir/lite/transforms/modify_io_nodes.cc +++ b/tensorflow/compiler/mlir/lite/transforms/modify_io_nodes.cc @@ -42,7 +42,7 @@ struct ModifyIONodesPass public: MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ModifyIONodesPass) - explicit ModifyIONodesPass() {} + explicit ModifyIONodesPass() = default; explicit ModifyIONodesPass(mlir::Type input_type, mlir::Type output_type) { this->input_type = input_type; this->output_type = output_type; diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index 3e9cc005dafe01..c3d28495a31fde 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -2155,18 +2155,13 @@ def ReorderGatherAndCast : Pat< // Replace division by a constant with a multiplication by a reciprocal of that // constant. Floating point division can be ~10x more expensive than a // multiplication. -// Only do the replacement when arg0 is not a constant, otherwise the newly -// generated div will be converted to mul again if the const div is not -// folded (that could happen when const tensor is very large), and that will -// cause infinite recursion. 
def RealDivWithF32ConstDivisor : Pat< (TFL_DivOp:$src $arg0, (Arith_ConstantOp FloatElementsAttr<32>:$value), $activation), (TFL_MulOp:$dest1 $arg0, (TFL_DivOp (Arith_ConstantOp (GetScalarOfType<1> (Arith_ConstantOp $value))), (Arith_ConstantOp $value), TFL_AF_None), - $activation), - [(NotConstantLike $arg0)]>; + $activation)>; // Replace casting a boolean tensor to a numeric type, followed by comparing // with zero. Note it doesn't matter what type we're casting to. HasSameType diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc index 402674f7cbcf95..81a1a4e286f174 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc @@ -170,7 +170,7 @@ class PrepareCompositeFunctionsPass public: MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PrepareCompositeFunctionsPass) - explicit PrepareCompositeFunctionsPass() {} + explicit PrepareCompositeFunctionsPass() = default; private: // TODO(b/160915525): Consolidate FuncAttr and StringAttr into one. diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc b/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc index 96412f20633f6a..7453ed54975a5a 100644 --- a/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc +++ b/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc @@ -43,7 +43,7 @@ limitations under the License. 
namespace mlir { namespace TFL { namespace { -#define GEN_PASS_CLASSES +#define GEN_PASS_DEF_QUANTIZEVARIABLESPASS #include "tensorflow/compiler/mlir/lite/transforms/passes.h.inc" using ResourceIdMap = @@ -80,7 +80,7 @@ Type GetDequantizedTypeFromAssigneVariableOp(VarHandleOp var_handle_op) { } class QuantizeVariablesPass - : public QuantizeVariablesPassBase { + : public impl::QuantizeVariablesPassBase { public: MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(QuantizeVariablesPass) explicit QuantizeVariablesPass() = default; diff --git a/tensorflow/compiler/mlir/lite/transforms/raise_custom_ops.cc b/tensorflow/compiler/mlir/lite/transforms/raise_custom_ops.cc index 25d9b15fec858a..80e0986209e8d0 100644 --- a/tensorflow/compiler/mlir/lite/transforms/raise_custom_ops.cc +++ b/tensorflow/compiler/mlir/lite/transforms/raise_custom_ops.cc @@ -42,7 +42,7 @@ struct RaiseCustomOpsPass public: MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(RaiseCustomOpsPass) - explicit RaiseCustomOpsPass() {} + explicit RaiseCustomOpsPass() = default; explicit RaiseCustomOpsPass(const std::vector &target_ops) { this->target_ops_ = target_ops; } diff --git a/tensorflow/compiler/mlir/lite/transforms/runtime_verify.cc b/tensorflow/compiler/mlir/lite/transforms/runtime_verify.cc index a814f35a385c29..c735517dd2f1f3 100644 --- a/tensorflow/compiler/mlir/lite/transforms/runtime_verify.cc +++ b/tensorflow/compiler/mlir/lite/transforms/runtime_verify.cc @@ -31,7 +31,7 @@ class RuntimeVerifyPass public: MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(RuntimeVerifyPass) - explicit RuntimeVerifyPass() {} + explicit RuntimeVerifyPass() = default; private: void runOnOperation() override; diff --git a/tensorflow/compiler/mlir/lite/transforms/tf_legalizations/while_loop_outline_pass.cc b/tensorflow/compiler/mlir/lite/transforms/tf_legalizations/while_loop_outline_pass.cc index a8ef6ac3b0d711..29576e8e06676a 100644 --- a/tensorflow/compiler/mlir/lite/transforms/tf_legalizations/while_loop_outline_pass.cc +++ 
b/tensorflow/compiler/mlir/lite/transforms/tf_legalizations/while_loop_outline_pass.cc @@ -59,10 +59,10 @@ bool IsCompatibleTypeWithTFLCastOp(Type type) { elemType.isF64()) return true; - // I1, I4, I8, I16, I32, I64 types are allowed. - if (elemType.isInteger(1) || elemType.isInteger(4) || elemType.isInteger(8) || - elemType.isInteger(16) || elemType.isInteger(32) || - elemType.isInteger(64)) + // I1, I2, I4, I8, I16, I32, I64 types are allowed. + if (elemType.isInteger(1) || elemType.isInteger(2) || elemType.isInteger(4) || + elemType.isInteger(8) || elemType.isInteger(16) || + elemType.isInteger(32) || elemType.isInteger(64)) return true; // Complex> is allowed. diff --git a/tensorflow/compiler/mlir/lite/transforms/trim_functions_tf.cc b/tensorflow/compiler/mlir/lite/transforms/trim_functions_tf.cc index f5699502eb134f..f88fc74b017555 100644 --- a/tensorflow/compiler/mlir/lite/transforms/trim_functions_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/trim_functions_tf.cc @@ -44,7 +44,7 @@ class TrimFunctionsPass public: MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TrimFunctionsPass) - explicit TrimFunctionsPass() {} + explicit TrimFunctionsPass() = default; explicit TrimFunctionsPass(llvm::ArrayRef trim_funcs_allowlist) { this->trim_funcs_allowlist_ = trim_funcs_allowlist; } diff --git a/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc b/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc index d464edc4078618..1b82ca5b0e61dc 100644 --- a/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc @@ -67,14 +67,14 @@ template llvm::SmallVector ReadAsHostEndian(ArrayRef bytes) { llvm::SmallVector ret; size_t read_size = sizeof(T); - int bytes_len = bytes.size(); + size_t bytes_len = bytes.size(); assert(bytes_len % read_size == 0); - int elem_count = bytes_len / read_size; + size_t elem_count = bytes_len / read_size; ret.reserve(elem_count); const char* data_ptr = 
reinterpret_cast(bytes.data()); - for (int i = 0; i < elem_count; i++) { + for (size_t i = 0; i < elem_count; i++) { T val = llvm::support::endian::readNext(data_ptr); ret.push_back(mlir::APInt(sizeof(T) * 8, val)); @@ -301,9 +301,17 @@ StatusOr ConvertIntBuffer( return mlir::ElementsAttr( DenseElementsAttr::get(shaped_type, ArrayRef(boolValues))); } + case 2: { + auto i2Values = tflite::UnpackDenseLowBitIntoInt8( + buffer, shaped_type.getNumElements(), /*bit_width=*/2); + // Use `getFromRawBuffer()` instead of `get()` to bypass a templated size + // check which doesn't work with int2 because int2_t doesn't exist. + return mlir::ElementsAttr(DenseElementsAttr::getFromRawBuffer( + shaped_type, ArrayRef(i2Values))); + } case 4: { - auto i4Values = - tflite::UnpackDenseInt4IntoInt8(buffer, shaped_type.getNumElements()); + auto i4Values = tflite::UnpackDenseLowBitIntoInt8( + buffer, shaped_type.getNumElements(), /*bit_width=*/4); // Use `getFromRawBuffer()` instead of `get()` to bypass a templated size // check which doesn't work with int4 because int4_t doesn't exist. return mlir::ElementsAttr(DenseElementsAttr::getFromRawBuffer( @@ -354,7 +362,7 @@ StatusOr ConvertFloatBuffer( assert(bytes_len % 2 == 0); // Supports both BF16 and F16. 
assert(elem_type.isF16() || elem_type.isBF16()); - int elem_count = bytes_len / 2; + size_t elem_count = bytes_len / 2; if (elem_type.isF16()) { std::vector values; @@ -362,7 +370,7 @@ StatusOr ConvertFloatBuffer( const char* data = reinterpret_cast(buffer.data()); - for (int i = 0; i < elem_count; i++) { + for (size_t i = 0; i < elem_count; i++) { uint16_t bit_repr = llvm::support::endian::readNext< uint16_t, llvm::endianness::native, llvm::support::unaligned>( data); @@ -377,7 +385,7 @@ StatusOr ConvertFloatBuffer( const char* data = reinterpret_cast(buffer.data()); - for (int i = 0; i < elem_count; i++) { + for (size_t i = 0; i < elem_count; i++) { uint16_t bit_repr = llvm::support::endian::readNext< uint16_t, llvm::endianness::native, llvm::support::unaligned>( data); @@ -390,13 +398,13 @@ StatusOr ConvertFloatBuffer( } case 32: { assert(bytes_len % 4 == 0); - int elem_count = bytes_len / 4; + size_t elem_count = bytes_len / 4; std::vector values; values.reserve(elem_count); const char* data = reinterpret_cast(buffer.data()); - for (int i = 0; i < elem_count; i++) { + for (size_t i = 0; i < elem_count; i++) { uint32_t bit_repr = llvm::support::endian::readNext(data); @@ -407,13 +415,13 @@ StatusOr ConvertFloatBuffer( } case 64: { assert(bytes_len % 8 == 0); - int elem_count = bytes_len / 8; + size_t elem_count = bytes_len / 8; std::vector values; values.reserve(elem_count); const char* data = reinterpret_cast(buffer.data()); - for (int i = 0; i < elem_count; i++) { + for (size_t i = 0; i < elem_count; i++) { uint64_t bit_repr = llvm::support::endian::readNext(data); diff --git a/tensorflow/compiler/mlir/lite/utils/convert_type.cc b/tensorflow/compiler/mlir/lite/utils/convert_type.cc index b118bab483048a..d774055fd2928a 100644 --- a/tensorflow/compiler/mlir/lite/utils/convert_type.cc +++ b/tensorflow/compiler/mlir/lite/utils/convert_type.cc @@ -114,6 +114,8 @@ mlir::Type ConvertElementType(tflite::TensorType type, mlir::Builder builder) { return 
mlir::ComplexType::get(builder.getF32Type()); case tflite::TensorType_COMPLEX128: return mlir::ComplexType::get(builder.getF64Type()); + case tflite::TensorType_INT2: + return builder.getIntegerType(2); case tflite::TensorType_INT4: return builder.getIntegerType(4); case tflite::TensorType_INT8: @@ -143,7 +145,9 @@ tensorflow::DataType TflTypeToTfType(tflite::TensorType type) { return tensorflow::DT_FLOAT; case tflite::TensorType_FLOAT64: return tensorflow::DT_DOUBLE; - // TODO(b/246806634): Tensorflow DT_INT4 type doesn't exist yet + // TODO(b/246806634): Tensorflow DT_INT2/4 type doesn't exist yet + case tflite::TensorType_INT2: + return tensorflow::DT_INT8; case tflite::TensorType_INT4: return tensorflow::DT_INT8; case tflite::TensorType_INT8: diff --git a/tensorflow/compiler/mlir/lite/utils/low_bit_utils.cc b/tensorflow/compiler/mlir/lite/utils/low_bit_utils.cc index aa2e9697595b89..d0710f8b4d49d8 100644 --- a/tensorflow/compiler/mlir/lite/utils/low_bit_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/low_bit_utils.cc @@ -21,39 +21,41 @@ limitations under the License. 
namespace tflite { -std::vector PackInt4ValuesDensely(std::vector src_buffer) { +std::vector PackLowBitValuesDensely(std::vector src_buffer, + int bit_width) { auto num_elements = src_buffer.size(); - auto packed_size = (num_elements + 1) / 2; - std::vector packed_buffer((num_elements + 1) / 2); + const int elements_per_byte = 8 / bit_width; + auto packed_size = (num_elements + elements_per_byte - 1) / elements_per_byte; + std::vector packed_buffer(packed_size, 0); + const uint8_t mask = (1 << bit_width) - 1; - for (int i = 0; i < num_elements - 1; i += 2) { - packed_buffer[i / 2] = src_buffer[i] & 0x0F; - packed_buffer[i / 2] |= src_buffer[i + 1] << 4; - } - - // Copy the final nibble if the buffer is odd-lengthed - if (num_elements % 2 != 0) { - packed_buffer[packed_size - 1] = src_buffer[num_elements - 1] & 0x0F; + for (int i = 0; i < num_elements; ++i) { + int byte_index = i / elements_per_byte; + int bit_offset = (i % elements_per_byte) * bit_width; + packed_buffer[byte_index] |= (src_buffer[i] & mask) << bit_offset; } return packed_buffer; } -std::vector UnpackDenseInt4IntoInt8( - const std::vector& src_buffer, int64_t num_elements) { +std::vector UnpackDenseLowBitIntoInt8( + const std::vector& src_buffer, int64_t num_elements, + int bit_width) { std::vector unpacked_buffer; unpacked_buffer.reserve(num_elements); + const int elements_per_byte = 8 / bit_width; + const int sign_bit_shift = 8 - bit_width; for (uint8_t value : src_buffer) { - // Cast to signed before right-shifting to ensure correct sign extension - unpacked_buffer.push_back(static_cast(value << 4) >> 4); - unpacked_buffer.push_back(static_cast(value) >> 4); - } - - // The last element might be a padded zero, so check and pop if needed - if (unpacked_buffer.size() > num_elements) { - assert(unpacked_buffer.size() == num_elements + 1); - unpacked_buffer.pop_back(); + for (int i = 0; i < elements_per_byte; ++i) { + if (unpacked_buffer.size() == num_elements) break; + int bit_offset = i * bit_width; 
+ uint8_t extracted_value = (value >> bit_offset); + // Sign extend + unpacked_buffer.push_back( + static_cast(extracted_value << sign_bit_shift) >> + sign_bit_shift); + } } return unpacked_buffer; diff --git a/tensorflow/compiler/mlir/lite/utils/low_bit_utils.h b/tensorflow/compiler/mlir/lite/utils/low_bit_utils.h index fa9bd851eab284..f0633410a45c66 100644 --- a/tensorflow/compiler/mlir/lite/utils/low_bit_utils.h +++ b/tensorflow/compiler/mlir/lite/utils/low_bit_utils.h @@ -20,17 +20,18 @@ limitations under the License. #include namespace tflite { -// Assumes that `src_tensor` is a buffer where each element is a 4-bit value -// stored in 8-bit. -// Returns a new buffer that is packed densely with 2 4-bit values in a byte. -// The packing format is low-bits-first, i.e. the lower nibble of a byte is -// filled first, followed by the upper nibble. -std::vector PackInt4ValuesDensely(std::vector src_buffer); - -// Assumes `src_buffer` contains 2 4-bit elements packed in 8-bit. -// Returns a vector where each int8 element contains a int4 sign-extended value. -std::vector UnpackDenseInt4IntoInt8( - const std::vector& src_buffer, int64_t num_elements); +// Assumes that `src_tensor` is a buffer where each element is a low bit value +// (e.g. 2 or 4-bit) stored in 8-bit. +// Returns a new buffer that is packed densely. +// The packing format is low-bits-first. +std::vector PackLowBitValuesDensely(std::vector src_buffer, + int bit_width); + +// Assumes `src_buffer` contains densely packed low bit elements. +// Returns a vector where each int8 element contains a sign-extended value. 
+std::vector UnpackDenseLowBitIntoInt8( + const std::vector& src_buffer, int64_t num_elements, + int bit_width); } // namespace tflite #endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_LOW_BIT_UTILS_H_ diff --git a/tensorflow/compiler/mlir/lite/utils/tftext_utils_test.cc b/tensorflow/compiler/mlir/lite/utils/tftext_utils_test.cc index 2acb4dccb88a18..0ae1247e2a156a 100644 --- a/tensorflow/compiler/mlir/lite/utils/tftext_utils_test.cc +++ b/tensorflow/compiler/mlir/lite/utils/tftext_utils_test.cc @@ -43,13 +43,13 @@ void Register(const std::string& op_name, OpRegistry* registry) { } // namespace TEST(TfTextUtilsTest, TestTfTextRegistered) { - std::unique_ptr registry(new OpRegistry); + std::unique_ptr registry = std::make_unique(); Register("WhitespaceTokenizeWithOffsets", registry.get()); EXPECT_TRUE(IsTFTextRegistered(registry.get())); } TEST(TfTextUtilsTest, TestTfTextNotRegistered) { - std::unique_ptr registry(new OpRegistry); + std::unique_ptr registry = std::make_unique(); Register("Test", registry.get()); EXPECT_FALSE(IsTFTextRegistered(registry.get())); } diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc b/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc index 9d7e689f3b6a3c..0c6a636d38b822 100644 --- a/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc +++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc @@ -124,7 +124,7 @@ class ModifyMlirModulePass : public MlirOptimizationPass { }; FunctionDef XTimesTwo() { - const Tensor kTwo = test::AsScalar(2); + const Tensor kTwo = test::AsScalar(2); return FunctionDefHelper::Define( // Name "XTimesTwo", diff --git a/tensorflow/compiler/mlir/python/mlir.cc b/tensorflow/compiler/mlir/python/mlir.cc index 5eaf5d736262ca..4f2384347a7802 100644 --- a/tensorflow/compiler/mlir/python/mlir.cc +++ b/tensorflow/compiler/mlir/python/mlir.cc @@ -251,7 +251,7 @@ std::string ExperimentalConvertSavedModelToMlir( // Convert the SavedModelV2Bundle to an MLIR module. 
- std::vector exported_names = + std::vector exported_names = absl::StrSplit(exported_names_str, ',', absl::SkipEmpty()); mlir::DialectRegistry registry; mlir::func::registerAllExtensions(registry); @@ -270,10 +270,10 @@ std::string ExperimentalConvertSavedModelV1ToMlirLite( const std::string& saved_model_path, const std::string& exported_names_str, const std::string& tags, bool upgrade_legacy, bool show_debug_info, TF_Status* status) { - std::unordered_set tag_set = + std::unordered_set tag_set = absl::StrSplit(tags, ',', absl::SkipEmpty()); - std::vector exported_names = + std::vector exported_names = absl::StrSplit(exported_names_str, ',', absl::SkipEmpty()); mlir::DialectRegistry registry; mlir::func::registerAllExtensions(registry); @@ -299,7 +299,7 @@ std::string ExperimentalConvertSavedModelV1ToMlir( bool show_debug_info, TF_Status* status) { // Load the saved model into a SavedModelBundle. - std::unordered_set tag_set = + std::unordered_set tag_set = absl::StrSplit(tags, ',', absl::SkipEmpty()); tensorflow::SavedModelBundle bundle; @@ -311,7 +311,7 @@ std::string ExperimentalConvertSavedModelV1ToMlir( } // Convert the SavedModelBundle to an MLIR module. 
- std::vector exported_names = + std::vector exported_names = absl::StrSplit(exported_names_str, ',', absl::SkipEmpty()); mlir::DialectRegistry registry; mlir::func::registerAllExtensions(registry); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD index 5c0de51a4f059a..969e84996acb4d 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD @@ -125,6 +125,7 @@ cc_library( "@local_tsl//tsl/platform:regexp", "@local_xla//xla/mlir_hlo", "@local_xla//xla/mlir_hlo:mhlo_passes", + "@shardy//shardy/dialect/sdy/ir:register", "@stablehlo//:chlo_ops", "@stablehlo//:stablehlo_ops", "@stablehlo//:stablehlo_passes", diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset_test.cc index 9a82ea7194614e..5d6d36ed3a6c7d 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset_test.cc @@ -36,8 +36,6 @@ using ::testing::HasSubstr; using ::testing::Key; using ::testing::SizeIs; using ::testing::StrEq; -using ::tsl::testing::IsOk; -using ::tsl::testing::StatusIs; TEST(CreateRepresentativeDatasetFileMapTest, ConfigWithoutExplicitSignatureKeyMappedToServingDefault) { @@ -52,7 +50,7 @@ TEST(CreateRepresentativeDatasetFileMapTest, representative_dataset_file_map = CreateRepresentativeDatasetFileMap(representative_dataset_configs); - ASSERT_THAT(representative_dataset_file_map, IsOk()); + ASSERT_THAT(representative_dataset_file_map, absl_testing::IsOk()); ASSERT_THAT(*representative_dataset_file_map, SizeIs(1)); EXPECT_THAT(*representative_dataset_file_map, Contains(Key("serving_default"))); @@ -74,7 +72,7 @@ TEST(CreateRepresentativeDatasetFileMapTest, 
ConfigWithExplicitSignatureKey) { representative_dataset_file_map = CreateRepresentativeDatasetFileMap(representative_dataset_configs); - ASSERT_THAT(representative_dataset_file_map, IsOk()); + ASSERT_THAT(representative_dataset_file_map, absl_testing::IsOk()); ASSERT_THAT(*representative_dataset_file_map, SizeIs(1)); EXPECT_THAT(*representative_dataset_file_map, Contains(Key(StrEq("test_signature_key")))); @@ -103,8 +101,9 @@ TEST(CreateRepresentativeDatasetFileMapTest, CreateRepresentativeDatasetFileMap(representative_dataset_configs); EXPECT_THAT(representative_dataset_file_map, - StatusIs(absl::StatusCode::kInvalidArgument, - HasSubstr("duplicate signature key: serving_default"))); + absl_testing::StatusIs( + absl::StatusCode::kInvalidArgument, + HasSubstr("duplicate signature key: serving_default"))); } } // namespace diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/io_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/io_test.cc index a3a09bdb35daaa..2fb8f11a4e4349 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/io_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/io_test.cc @@ -49,23 +49,23 @@ class TestEnvBrokenFileSystem : public tsl::Env { public: TestEnvBrokenFileSystem() = default; - bool MatchPath(const tsl::string& path, const tsl::string& pattern) override { + bool MatchPath(const std::string& path, const std::string& pattern) override { return false; } void SleepForMicroseconds(int64_t micros) override {} - tsl::string GetRunfilesDir() override { return tsl::string("dummy_path"); } + std::string GetRunfilesDir() override { return std::string("dummy_path"); } int64_t GetCurrentThreadId() override { return 0; } tsl::Thread* StartThread(const tsl::ThreadOptions& thread_options, - const tsl::string& name, + const std::string& name, absl::AnyInvocable fn) override { return nullptr; } - bool GetCurrentThreadName(tsl::string* name) override { return false; } + bool GetCurrentThreadName(std::string* 
name) override { return false; } void SchedClosure(absl::AnyInvocable closure) override {} @@ -82,9 +82,9 @@ class TestEnvBrokenFileSystem : public tsl::Env { return absl::OkStatus(); } - tsl::string FormatLibraryFileName(const tsl::string& name, - const tsl::string& version) override { - return tsl::string("dummy_path"); + std::string FormatLibraryFileName(const std::string& name, + const std::string& version) override { + return std::string("dummy_path"); } // This is the part that would break the `CreateTmpDir` function because it @@ -95,7 +95,7 @@ class TestEnvBrokenFileSystem : public tsl::Env { } private: - void GetLocalTempDirectories(std::vector* list) override { + void GetLocalTempDirectories(std::vector* list) override { list->push_back("/tmp"); } }; @@ -107,7 +107,7 @@ class TestEnvBrokenFileSystemAndNoLocalTempDirs private: // This is the part that essentially breaks the `GetLocalTmpFileName` function // because it doesn't provide any available temp dirs. - void GetLocalTempDirectories(std::vector* list) override {} + void GetLocalTempDirectories(std::vector* list) override {} }; TEST(IoTest, GetLocalTmpFileNameGivesValidFileName) { diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc index babda33245a7c8..0818c8013e534e 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc @@ -246,8 +246,8 @@ class ConvertTfQuantToMhloIntTest : public Test { // Convert to double for comparison. This is needed for comparing integers // since it LiteralTestUtil asserts different integers even if it is within // error_spec. 
- TF_ASSERT_OK_AND_ASSIGN(auto expected_double, expected->Convert(xla::F64)) - TF_ASSERT_OK_AND_ASSIGN(auto result_double, result->Convert(xla::F64)) + TF_ASSERT_OK_AND_ASSIGN(auto expected_double, expected->Convert(xla::F64)); + TF_ASSERT_OK_AND_ASSIGN(auto result_double, result->Convert(xla::F64)); EXPECT_TRUE(xla::LiteralTestUtil::Near(expected_double, result_double, xla::ErrorSpec(error_tolerance))); } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_xla_call_module_op_to_bfloat16.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_xla_call_module_op_to_bfloat16.cc index b55bf3f5d18558..7ee6bbd98f61e6 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_xla_call_module_op_to_bfloat16.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_xla_call_module_op_to_bfloat16.cc @@ -36,6 +36,7 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Support/TypeID.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "shardy/dialect/sdy/ir/register.h" // from @shardy #include "stablehlo/dialect/Serialization.h" // from @stablehlo #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" // IWYU pragma: keep @@ -54,6 +55,7 @@ absl::StatusOr ConvertSerializedStableHloModuleToBfloat16( } MLIRContext context; + mlir::sdy::loadAllRequiredDialects(&context); OwningOpRef stablehlo_module_op = mlir::stablehlo::deserializePortableArtifact(serialized_stablehlo_module, &context); @@ -77,7 +79,8 @@ absl::StatusOr ConvertSerializedStableHloModuleToBfloat16( std::string bytecode; llvm::raw_string_ostream os(bytecode); if (failed(mlir::stablehlo::serializePortableArtifact( - stablehlo_module_op.get(), version.value().toString(), os))) { + stablehlo_module_op.get(), version.value().toString(), os, + 
/*allowOtherDialects=*/true))) { return absl::InternalError("Failed to serialize StableHLO module."); } return bytecode; diff --git a/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf.cc index c2e91c5da16e93..1f6464d85f5ef4 100644 --- a/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf.cc @@ -4822,7 +4822,7 @@ class ConvertConvBackpropInputOp : public OpRewritePattern { dilations_attr.template getValues().begin(), dilations_attr.template getValues().end()}; auto strides_attr = GetI64ElementsAttr(op.getStrides()); - std::vector strides{ + std::vector strides{ strides_attr.template getValues().begin(), strides_attr.template getValues().end()}; @@ -5022,7 +5022,7 @@ class ConvertConvBackpropFilterOp : public OpRewritePattern { dilations_attr.template getValues().begin(), dilations_attr.template getValues().end()}; auto strides_attr = GetI64ElementsAttr(op.getStrides()); - std::vector strides{ + std::vector strides{ strides_attr.template getValues().begin(), strides_attr.template getValues().end()}; diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index cc178e762ecadd..cbd6bc3b283504 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -962,12 +962,18 @@ cc_library( hdrs = ["utils/deserialize_mlir_module_utils.h"], deps = [ ":error_util", - "//tensorflow/core/platform:status", + "//tensorflow/core:lib", + "//tensorflow/core:lib_headers_for_pybind", + "@com_google_absl//absl/log", "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", "@local_xla//xla:status_macros", + "@local_xla//xla/tsl/lib/io:inputstream_interface", + "@local_xla//xla/tsl/lib/io:zlib_compression_options", + "@local_xla//xla/tsl/lib/io:zlib_inputstream", ], 
) @@ -976,10 +982,20 @@ cc_library( srcs = ["utils/serialize_mlir_module_utils.cc"], hdrs = ["utils/serialize_mlir_module_utils.h"], deps = [ - "//tensorflow/compiler/jit:flags", "//tensorflow/compiler/jit:flags_headers", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", "@llvm-project//llvm:Support", + "@llvm-project//mlir:BytecodeWriter", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@local_xla//xla/tsl/lib/io:zlib_compression_options", + "@local_xla//xla/tsl/lib/io:zlib_outputbuffer", + "@local_xla//xla/tsl/platform:env", + "@local_xla//xla/tsl/platform:errors", ], ) @@ -987,6 +1003,7 @@ tf_cc_test( name = "serialize_mlir_module_utils_test", srcs = ["utils/serialize_mlir_module_utils_test.cc"], deps = [ + ":deserialize_mlir_module_utils", ":serialize_mlir_module_utils", "//tensorflow/compiler/jit:flags", "//tensorflow/core:test", diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc index 8a641e06d93519..e8d0ea525943fd 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc @@ -1107,5 +1107,8 @@ LogicalResult IslandOp::fold(FoldAdaptor, // TableGen'd op method definitions //===----------------------------------------------------------------------===// +using mlir::tf_executor::ControlType; +using mlir::tf_executor::TokenType; + #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc.inc" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index e23f510182259f..4104cf412acfd8 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -335,7 +335,6 @@ def TF_IfRegionOp : TF_Op<"IfRegion", "areTypesCompatible", 
"getEntrySuccessorOperands", "getRegionInvocationBounds", - "getSuccessorRegions" ]> ]> { let summary = "output = cond ? then_branch output : else_branch output"; @@ -395,7 +394,6 @@ def TF_GeneratorDatasetRegionOp : TF_Op<"GeneratorDatasetRegion", "areTypesCompatible", "getEntrySuccessorOperands", "getRegionInvocationBounds", - "getSuccessorRegions" ]>, SingleBlockImplicitTerminator<"YieldOp">, TF_GeneratorOpSideEffect, diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc index ee65668078ca59..6382f325a47505 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc @@ -3003,14 +3003,14 @@ void GeneratorDatasetRegionOp::getRegionInvocationBounds( } OperandRange GeneratorDatasetRegionOp::getEntrySuccessorOperands( - RegionBranchPoint point) { + RegionSuccessor successor) { auto end = this->getOperation()->operand_end(); - if (point.isParent()) { + if (successor.isParent()) { // The op itself doesn't branch back to itself. return ::mlir::OperandRange(end, end); - } else if (point.getRegionOrNull() == &getInit()) { + } else if (successor.getSuccessor() == &getInit()) { return getInitFuncOtherArgs(); - } else if (point.getRegionOrNull() == &getNext()) { + } else if (successor.getSuccessor() == &getNext()) { return getNextFuncOtherArgs(); } else /* finalize region */ { return getFinalizeFuncOtherArgs(); @@ -3024,13 +3024,15 @@ void GeneratorDatasetRegionOp::getSuccessorRegions( // The op itself branches to `init` first. regions.push_back( RegionSuccessor(&getInit(), getInit().front().getArguments())); - } else if (point.getRegionOrNull() == &getInit()) { + } else if (point.getTerminatorPredecessorOrNull()->getParentRegion() == + &getInit()) { // `init` branches to `next`, passing along the arguments given to `init`'s // yield. Said arguments precede the "other args". 
n = getInitFuncOtherArgs().size(); regions.push_back(RegionSuccessor( &getNext(), getNext().front().getArguments().drop_back(n))); - } else if (point.getRegionOrNull() == &getNext()) { + } else if (point.getTerminatorPredecessorOrNull()->getParentRegion() == + &getNext()) { // `next` branches to itself, or to `finalize`, passing all arguments given // to `next`s yield. @@ -3045,7 +3047,8 @@ void GeneratorDatasetRegionOp::getSuccessorRegions( &getFinalize(), getFinalize().front().getArguments().slice(0, num))); } else { // `finalize` branches back to the op itself, not passing any arguments. - regions.push_back(RegionSuccessor()); + regions.push_back(RegionSuccessor( + point.getTerminatorPredecessorOrNull()->getParentRegion())); } } @@ -3261,11 +3264,12 @@ void IfRegionOp::getRegionInvocationBounds( invocationBounds.assign(2, {0, 1}); } -OperandRange IfRegionOp::getEntrySuccessorOperands(RegionBranchPoint point) { +OperandRange IfRegionOp::getEntrySuccessorOperands(RegionSuccessor successor) { // IfRegionOp currently only allows one op (the condition), so there are no // remaining operands for the successor. - assert((point.isParent() || - (point == (*this)->getRegion(0) || point == (*this)->getRegion(1))) && + assert((successor.isParent() || + (successor.getSuccessor() == &(*this)->getRegion(0) || + successor.getSuccessor() == &(*this)->getRegion(1))) && "Invalid IfRegionOp region index."); auto end = this->getOperation()->operand_end(); return ::mlir::OperandRange(end, end); @@ -3275,16 +3279,20 @@ void IfRegionOp::getSuccessorRegions( RegionBranchPoint point, SmallVectorImpl& regions) { if (!point.isParent()) { // The `then` and the `else` region branch back to the parent operation. - regions.push_back(RegionSuccessor(getResults())); + regions.push_back( + RegionSuccessor(point.getTerminatorPredecessorOrNull(), getResults())); return; } else { // The parent can branch to either `then` or `else`. 
- regions.push_back(RegionSuccessor(&getThenBranch())); + regions.push_back( + RegionSuccessor(&getThenBranch(), getThenBranch().getArguments())); Region* elseRegion = &this->getElseBranch(); if (!elseRegion->empty()) - regions.push_back(RegionSuccessor(elseRegion)); + regions.push_back( + RegionSuccessor(elseRegion, elseRegion->getArguments())); else - regions.push_back(RegionSuccessor()); + regions.push_back(RegionSuccessor( + point.getTerminatorPredecessorOrNull()->getParentRegion())); } } @@ -3727,5 +3735,7 @@ LogicalResult BitcastOp::verify() { // TableGen'd op method definitions //===----------------------------------------------------------------------===// +using namespace mlir; // NOLINT + #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc.inc" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc index e4100657db7081..23683673fe189a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc @@ -3611,8 +3611,8 @@ SmallVector WhileRegionOp::getLoopRegions() { return {&getBody()}; } //===----------------------------------------------------------------------===// OperandRange WhileRegionOp::getEntrySuccessorOperands( - RegionBranchPoint point) { - if (point.isParent()) { + RegionSuccessor successor) { + if (successor.isParent()) { // WhileRegionOp branches to the condition, which branches to the body. But // the op itself doesn't branch back to itself. So this range is empty. 
auto end = this->getOperation()->operand_end(); @@ -3628,21 +3628,28 @@ OperandRange WhileRegionOp::getEntrySuccessorOperands( void WhileRegionOp::getSuccessorRegions( RegionBranchPoint point, SmallVectorImpl ®ions) { - if (!point.isParent() && point == (*this)->getRegion(0)) { + if (!point.isParent() && + (point.getTerminatorPredecessorOrNull() && + point.getTerminatorPredecessorOrNull()->getParentRegion() == + &(*this)->getRegion(0))) { // 'cond' branches to the body or returns. Operation *yield = getCond().front().getTerminator(); if (yield->getOperands().size() == 1 + this->getOperation()->getOperands().size()) { regions.push_back( RegionSuccessor(&getBody(), getBody().front().getArguments())); - regions.push_back(getResults()); + regions.push_back(RegionSuccessor(getOperation(), getResults())); } else { // For compatibility with older code, we allow the "yield" in a condition // to only yield a single boolean. In that case we can't forward any args. regions.push_back(RegionSuccessor(&getBody())); - regions.push_back(RegionSuccessor()); // branch back to parent, no args + regions.push_back( + RegionSuccessor(getOperation(), getResults().take_front(0))); } - } else if (!point.isParent() && point == (*this)->getRegion(1)) { + } else if (!point.isParent() && + (point.getTerminatorPredecessorOrNull() && + point.getTerminatorPredecessorOrNull()->getParentRegion() == + &(*this)->getRegion(1))) { // 'body' branches back to 'cond'. 
regions.push_back( RegionSuccessor(&getCond(), getCond().front().getArguments())); @@ -4510,7 +4517,7 @@ LogicalResult UniformQuantizedClipByValueOp::verify() { //===----------------------------------------------------------------------===// MutableOperandRange YieldOp::getMutableSuccessorOperands( - RegionBranchPoint point) { + RegionSuccessor successor) { if (auto whileOp = llvm::dyn_cast(this->getOperation()->getParentOp())) { if (&whileOp.getCond() == this->getOperation()->getParentRegion()) { @@ -4538,5 +4545,7 @@ MutableOperandRange YieldOp::getMutableSuccessorOperands( // TableGen'd op method definitions //===----------------------------------------------------------------------===// +using namespace mlir; // NOLINT + #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc.inc" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc index b5ce10d1500be8..7419149074fb8a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc @@ -188,5 +188,6 @@ std::optional _SendOp::GetResourceInstanceStr() { // TableGen'd op method definitions //===----------------------------------------------------------------------===// +using namespace mlir; // NOLINT #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc.inc" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 765ed1171a8449..a3305eef8a0819 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -1317,7 +1317,7 @@ func.func @testIfRegionElseTerminator(%arg0: tensor, %arg1: tensor<2xf32>) - // tf.Region yield number of results should match op number of results func.func @testIfRegionThenResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> 
{ - // expected-error @+1 {{'tf.IfRegion' op region control flow edge from Region #0 to parent results: source has 2 operands, but target successor needs 1}} + // expected-error @+1 {{'tf.IfRegion' op region control flow edge from Operation tf.Yield to parent results: source has 2 operands, but target successor needs 1}} %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t, %t) : (tensor<2xf32>, tensor<2xf32>) -> () @@ -1332,7 +1332,7 @@ func.func @testIfRegionThenResultCount(%arg0: tensor, %arg1: tensor<2xf32>) // ----- func.func @testIfRegionElseResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // expected-error @+1 {{'tf.IfRegion' op region control flow edge from Region #1 to parent results: source has 2 operands, but target successor needs 1}} + // expected-error @+1 {{'tf.IfRegion' op region control flow edge from Operation tf.Yield to parent results: source has 2 operands, but target successor needs 1}} %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t) : (tensor<2xf32>) -> () diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py index 118d7f38ebf959..1087c6e3a679bd 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py @@ -36,7 +36,8 @@ class TestModule(tf.Module): def __init__(self): super(TestModule, self).__init__() self.v42 = tf.Variable(42.0) - self.c43 = tf.constant(43.0) + # Use convert_to_tensor to avoid forcing eager `.numpy()` in graph/XLA mode. + self.c43 = tf.convert_to_tensor(43.0, dtype=tf.float32) # During serialization, the constants are given internal (non-user-accessible, non-semantically-load-bearing) exported names. 
# CHECK: "tf_saved_model.global_tensor"() <{sym_name = "[[CONST:[a-zA-Z_0-9.]+]]", type = tensor, value = dense<4.300000e+01> : tensor}> {tf_saved_model.exported_names = [{{.*}}]} : () -> () diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc index bc4487a4e3fd7d..954c318b416150 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc @@ -16,7 +16,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/transforms/einsum.h" #include -#include #include #include #include @@ -29,6 +28,7 @@ limitations under the License. #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Casting.h" @@ -230,7 +230,7 @@ std::optional> EquationToMap( llvm::StringRef equation) { llvm::SmallDenseMap map; for (int64_t i = 0; i < equation.size(); ++i) { - if (!std::isalpha(equation[i])) { + if (!llvm::isAlpha(equation[i])) { // Unsupported character in the equation. 
return std::nullopt; } @@ -263,7 +263,7 @@ std::optional> GetAvailableLabels( const int lhs_size = lhs.size(); for (int i = 0; i < lhs_size; ++i) { const char label = lhs[i]; - if (std::isalpha(label)) { + if (llvm::isAlpha(label)) { labels.remove(label); ++lhs_count; } else if (label == '.') { @@ -280,7 +280,7 @@ std::optional> GetAvailableLabels( const int rhs_size = rhs.size(); for (int i = 0; i < rhs_size; ++i) { const char label = rhs[i]; - if (std::isalpha(label)) { + if (llvm::isAlpha(label)) { labels.remove(label); ++rhs_count; } else if (label == '.') { @@ -318,7 +318,7 @@ std::tuple FlattenEllipsis( std::string new_lhs; for (int i = 0; i < lhs.size(); ++i) { const char label = lhs[i]; - if (std::isalpha(label)) { + if (llvm::isAlpha(label)) { new_lhs.push_back(label); } else { // Encounter ellipsis: generate unnamed labels then insert to the new @@ -333,7 +333,7 @@ std::tuple FlattenEllipsis( std::string new_rhs, new_rhs_labels; for (int i = 0; i < rhs.size(); ++i) { const char label = rhs[i]; - if (std::isalpha(label)) { + if (llvm::isAlpha(label)) { new_rhs.push_back(label); } else { // Encounter ellipsis: generate unnamed labels then insert to the new @@ -352,7 +352,7 @@ std::tuple FlattenEllipsis( std::string new_output; for (int i = 0; i < out.size(); ++i) { const char label = out[i]; - if (std::isalpha(label)) { + if (llvm::isAlpha(label)) { new_output.push_back(label); } else { // Encounter ellipsis: we will just insert the generated labels to the new diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import.cc b/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import.cc index a2c4a7031ed14b..0cdb563a45eed7 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import.cc @@ -49,7 +49,7 @@ static constexpr int kTextFileIndex_LineNumber = -1; class InitTextFileToImportPass : public 
impl::InitTextFileToImportPassBase { public: - InitTextFileToImportPass() {} + InitTextFileToImportPass() = default; InitTextFileToImportPass(const InitTextFileToImportPass&) {} explicit InitTextFileToImportPass(std::string saved_model_dir) { saved_model_dir_ = saved_model_dir; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import_test_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import_test_pass.cc index a985cdc11611b4..41c5cd4234f1cc 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import_test_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import_test_pass.cc @@ -46,7 +46,7 @@ class InitTextFileToImportTestPass : public impl::InitTextFileToImportTestPassBase< InitTextFileToImportTestPass> { public: - explicit InitTextFileToImportTestPass() {} + explicit InitTextFileToImportTestPass() = default; StringRef getArgument() const final { return "tf-init-text-file-to-import-test"; @@ -115,7 +115,7 @@ class InitTextFileToImportSavedModelTestPass : public impl::InitTextFileToImportSavedModelTestPassBase< InitTextFileToImportSavedModelTestPass> { public: - explicit InitTextFileToImportSavedModelTestPass() {} + explicit InitTextFileToImportSavedModelTestPass() = default; private: void runOnOperation() override; diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 2e023e3e057096..57a41f538f277f 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -17,7 +17,6 @@ limitations under the License. #include #include -#include #include #include #include @@ -36,6 +35,7 @@ limitations under the License. 
#include "absl/log/log.h" #include "absl/status/status.h" #include "absl/status/statusor.h" +#include "absl/strings/ascii.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" #include "absl/strings/string_view.h" @@ -289,8 +289,10 @@ ObjectNames::ObjectNames(const SavedObjectGraph& object_graph, // - `model.variables.0` // - `model.keras_api.layers.1.keras_api.trainable_variables.0` // - ... 10 more long aliases ending in digits ... - return std::make_tuple(isdigit(a.back()), a.size(), a) < - std::make_tuple(isdigit(b.back()), b.size(), b); + return std::make_tuple(absl::ascii_isdigit(a.back()), a.size(), + a) < + std::make_tuple(absl::ascii_isdigit(b.back()), b.size(), + b); }); for (const std::string& name : kv.second) { if (IsExported(name)) { diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.cc b/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.cc index c48f52576df4e3..0288006ee4d105 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.cc @@ -39,13 +39,13 @@ limitations under the License. 
namespace tensorflow { absl::Status ParseOutputArrayInfo(absl::string_view array_names, - std::vector* outputs) { + std::vector* outputs) { TF_RETURN_IF_ERROR(ParseNodeNames(array_names, *outputs)); return absl::OkStatus(); } -absl::Status ParseOutputArrayInfo(const std::vector& output_names, - std::vector* outputs) { +absl::Status ParseOutputArrayInfo(const std::vector& output_names, + std::vector* outputs) { for (auto& output_name : output_names) { if (output_name.empty()) continue; outputs->push_back(output_name); @@ -57,8 +57,8 @@ absl::Status ParseInputArrayInfo(absl::string_view array_names, absl::string_view data_types, absl::string_view shapes, GraphImportConfig::InputArrays* inputs) { - std::vector node_names; - std::vector node_dtypes; + std::vector node_names; + std::vector node_dtypes; std::vector>> node_shapes; TF_RETURN_IF_ERROR(ParseNodeNames(array_names, node_names)); TF_RETURN_IF_ERROR(ParseNodeDataTypes(data_types, node_dtypes)); @@ -113,8 +113,8 @@ static absl::Status HandleSubtype(absl::string_view subtype, } absl::Status ParseInputArrayInfo( - const std::vector& node_names, - const std::vector& node_dtypes, + const std::vector& node_names, + const std::vector& node_dtypes, const std::vector>>& node_shapes, GraphImportConfig::InputArrays* inputs) { std::vector used_node_dtypes; @@ -148,7 +148,7 @@ absl::Status ParseInputArrayInfo( // StringMap doesn't support reserve else reserve input map size here. 
for (int i = 0, end = node_names.size(); i < end; i++) { auto& name = node_names[i]; - const string& type = used_node_dtypes[i]; + const std::string& type = used_node_dtypes[i]; if (name.empty()) continue; auto it_inserted_pair = inputs->insert({name, {}}); @@ -193,7 +193,7 @@ absl::Status ParseNodeShapes( std::vector>>& shapes_vector) { shapes_vector.clear(); if (!shapes_str.empty()) { - std::vector node_shapes_str = absl::StrSplit(shapes_str, ':'); + std::vector node_shapes_str = absl::StrSplit(shapes_str, ':'); for (int i = 0; i < node_shapes_str.size(); i++) { if (node_shapes_str[i] == "*") { shapes_vector.push_back(std::nullopt); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.h b/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.h index 1119d4e2b33c4f..176773da45fcbc 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.h @@ -35,10 +35,10 @@ namespace tensorflow { // Parses the command line flag strings to the specification of nodes in // the Graph. absl::Status ParseOutputArrayInfo(absl::string_view array_names, - std::vector* outputs); + std::vector* outputs); -absl::Status ParseOutputArrayInfo(const std::vector& output_names, - std::vector* outputs); +absl::Status ParseOutputArrayInfo(const std::vector& output_names, + std::vector* outputs); // Parses the command line flag strings to the specification of nodes in // the Graph. `data_types` input string can be empty since the flag is optional. 
@@ -48,8 +48,8 @@ absl::Status ParseInputArrayInfo(absl::string_view array_names, GraphImportConfig::InputArrays* inputs); absl::Status ParseInputArrayInfo( - const std::vector& node_names, - const std::vector& node_dtypes, + const std::vector& node_names, + const std::vector& node_dtypes, const std::vector>>& node_shapes, GraphImportConfig::InputArrays* inputs); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc index 858c70a54a58d6..3706b8afe34d78 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc @@ -17,12 +17,12 @@ limitations under the License. #include #include -#include #include #include #include #include "absl/log/log.h" +#include "absl/strings/ascii.h" #include "absl/strings/str_split.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/FormatVariadic.h" @@ -99,8 +99,7 @@ std::vector BridgeLoggerConfig::GetFilter( bool BridgeLoggerConfig::ShouldOnlyDumpTopLevelPasses() { const char* env_var = getenv(kEnableOnlyTopLevelPassesEnvVar); - std::string value(env_var); - std::transform(value.begin(), value.end(), value.begin(), ::tolower); + std::string value = absl::AsciiStrToLower(env_var); // Return true if value is "1" or "true"; otherwise, false. 
return value == "1" || value == "true"; } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc index b0ad4e265633d8..550ab547498f45 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc @@ -249,14 +249,14 @@ absl::StatusOr ConvertTensor(const Tensor& input_tensor, CONVERT_FLAT(DT_BOOL, bool) CONVERT_FLAT(DT_FLOAT, float) CONVERT_FLAT(DT_DOUBLE, double) - CONVERT_FLAT(DT_INT8, int8) - CONVERT_FLAT(DT_INT16, int16) - CONVERT_FLAT(DT_INT32, int32) + CONVERT_FLAT(DT_INT8, int8_t) + CONVERT_FLAT(DT_INT16, int16_t) + CONVERT_FLAT(DT_INT32, int32_t) CONVERT_FLAT(DT_INT64, int64_t) - CONVERT_FLAT(DT_UINT8, uint8) - CONVERT_FLAT(DT_UINT16, uint16) - CONVERT_FLAT(DT_UINT32, uint32) - CONVERT_FLAT(DT_UINT64, uint64) + CONVERT_FLAT(DT_UINT8, uint8_t) + CONVERT_FLAT(DT_UINT16, uint16_t) + CONVERT_FLAT(DT_UINT32, uint32_t) + CONVERT_FLAT(DT_UINT64, uint64_t) CONVERT_FLAT(DT_COMPLEX64, std::complex) CONVERT_FLAT(DT_COMPLEX128, std::complex) diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc index a34553623408d8..b120b6c786edb6 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc @@ -162,11 +162,11 @@ TEST_F(ConvertTensorTest, Simple) { ASSERT_NO_FATAL_FAILURE(VerifyConversion( {static_cast(1), static_cast(-1)}, DT_INT4, mlir::IntegerType::get(&context, 4))); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, -1}, DT_INT8, mlir::IntegerType::get(&context, 8))); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, -1}, DT_INT16, mlir::IntegerType::get(&context, 16))); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + 
ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, -1}, DT_INT32, mlir::IntegerType::get(&context, 32))); ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, -1}, DT_INT64, mlir::IntegerType::get(&context, 64))); @@ -175,19 +175,19 @@ TEST_F(ConvertTensorTest, Simple) { {static_cast(1), static_cast(2)}, DT_UINT4, mlir::IntegerType::get( &context, 4, mlir::IntegerType::SignednessSemantics::Unsigned))); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, 2}, DT_UINT8, mlir::IntegerType::get( &context, 8, mlir::IntegerType::SignednessSemantics::Unsigned))); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, 2}, DT_UINT16, mlir::IntegerType::get( &context, 16, mlir::IntegerType::SignednessSemantics::Unsigned))); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, 2}, DT_UINT32, mlir::IntegerType::get( &context, 32, mlir::IntegerType::SignednessSemantics::Unsigned))); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, 2}, DT_UINT64, mlir::IntegerType::get( &context, 64, mlir::IntegerType::SignednessSemantics::Unsigned))); @@ -222,11 +222,11 @@ TEST_F(ConvertTensorTest, SimpleDenseResourceElements) { ASSERT_NO_FATAL_FAILURE(VerifyConversion( {static_cast(1), static_cast(-1)}, DT_INT4, mlir::IntegerType::get(&context, 4), true)); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, -1}, DT_INT8, mlir::IntegerType::get(&context, 8), true)); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, -1}, DT_INT16, mlir::IntegerType::get(&context, 16), true)); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, -1}, DT_INT32, mlir::IntegerType::get(&context, 32), true)); ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, -1}, DT_INT64, mlir::IntegerType::get(&context, 64), true)); @@ -236,22 +236,22 @@ 
TEST_F(ConvertTensorTest, SimpleDenseResourceElements) { mlir::IntegerType::get(&context, 4, mlir::IntegerType::SignednessSemantics::Unsigned), true)); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, 2}, DT_UINT8, mlir::IntegerType::get(&context, 8, mlir::IntegerType::SignednessSemantics::Unsigned), true)); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, 2}, DT_UINT16, mlir::IntegerType::get(&context, 16, mlir::IntegerType::SignednessSemantics::Unsigned), true)); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, 2}, DT_UINT32, mlir::IntegerType::get(&context, 32, mlir::IntegerType::SignednessSemantics::Unsigned), true)); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, 2}, DT_UINT64, mlir::IntegerType::get(&context, 64, mlir::IntegerType::SignednessSemantics::Unsigned), diff --git a/tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config_test.cc index 09a76102557c4f..a4f2861276a9bd 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config_test.cc @@ -59,9 +59,9 @@ TEST(DataDumperLoggerConfig, TestPassFilter) { 1); setenv("TF_DUMP_GRAPH_PREFIX", "sponge", 1); - const string kTestFilename = "test.txt"; + const std::string kTestFilename = "test.txt"; int print_callback_count = 0; - auto get_filename_fn = [](const string &filename, mlir::Operation *op) { + auto get_filename_fn = [](const std::string& filename, mlir::Operation* op) { return filename; }; auto print_callback = [&](llvm::raw_ostream &out) { diff --git a/tensorflow/compiler/mlir/tensorflow/utils/deserialize_mlir_module_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/deserialize_mlir_module_utils.cc index da7917c9c21a4c..bcd3164cd10f7c 100644 
--- a/tensorflow/compiler/mlir/tensorflow/utils/deserialize_mlir_module_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/deserialize_mlir_module_utils.cc @@ -15,7 +15,16 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/deserialize_mlir_module_utils.h" +#include +#include +#include +#include +#include +#include + +#include "absl/log/log.h" #include "absl/status/status.h" +#include "absl/strings/str_cat.h" #include "llvm/ADT/StringRef.h" #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project @@ -23,8 +32,54 @@ limitations under the License. #include "mlir/Parser/Parser.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "xla/status_macros.h" +#include "tensorflow/core/lib/io/inputstream_interface.h" +#include "tensorflow/core/lib/io/zlib_compression_options.h" +#include "tensorflow/core/lib/io/zlib_inputstream.h" +#include "tensorflow/core/platform/tstring.h" namespace tensorflow { +namespace { +// Wrap memory buffer into InputStreamInterface +class MemoryInputStream : public tensorflow::io::InputStreamInterface { + public: + explicit MemoryInputStream(const char* buffer, size_t length) + : buf_(buffer), len_(length), pos_(0) {} + + ~MemoryInputStream() override = default; + + absl::Status ReadNBytes(int64_t bytes_to_read, tstring* result) override { + result->clear(); + if (bytes_to_read < 0) { + return absl::InvalidArgumentError(absl::StrCat( + "Can't read a negative number of bytes: ", bytes_to_read)); + } + absl::Status status = absl::OkStatus(); + int64_t bytes = bytes_to_read; + if (pos_ + bytes_to_read > len_) { + bytes = len_ - pos_; + status = absl::OutOfRangeError("Reached end of file"); + } + if (bytes > 0) { + result->resize(bytes); + memcpy(&(*result)[0], &buf_[pos_], bytes); + pos_ += bytes; + } + return status; + } + + int64_t Tell() const override { return pos_; } + + absl::Status Reset() override { + pos_ 
= 0; + return absl::OkStatus(); + } + + private: + const char* buf_; // Not owned. + int64_t len_; + int64_t pos_ = 0; // Tracks where we are in the file. +}; +} // namespace absl::Status DeserializeMlirModule( llvm::StringRef serialized_mlir_module, mlir::MLIRContext* mlir_context, @@ -37,13 +92,44 @@ absl::Status DeserializeMlirModule( // error reporting system. mlir::StatusScopedDiagnosticHandler error_handler(mlir_context); - // Parse the module. - *mlir_module = mlir::parseSourceString(serialized_mlir_module, - mlir_context); - if (!*mlir_module) - return error_handler.Combine( - absl::InvalidArgumentError("could not parse MLIR module")); - + // Look for the GZIP magic number to check if this is a compressed bytecode. + if (serialized_mlir_module.starts_with("\x1f\x8b")) { + // Try to uncompress the and parse the bytecode. + auto input_stream = std::make_unique( + serialized_mlir_module.data(), serialized_mlir_module.size()); + io::ZlibCompressionOptions options = io::ZlibCompressionOptions::GZIP(); + auto zlib_stream = std::make_unique( + input_stream.get(), options.input_buffer_size, + options.output_buffer_size, options); + tstring uncompressed_bytecode; + absl::Status s = zlib_stream->ReadNBytes(/*bytes_to_read=*/INT_MAX, + &uncompressed_bytecode); + // OK status means the decompression is successful. + // OutOfRange error means the decompression is successful but end of input + // was reached before *bytes_to_read* bytes were read. + if (!s.ok() && !absl::IsOutOfRange(s)) { + // Failed to uncompress the bytecode and it is not the end of the input. + return error_handler.Combine(absl::InvalidArgumentError( + absl::StrCat("Failed to uncompress MLIR module", s.message()))); + } + // Parse the uncompressed bytecode. 
+ auto uncompressed_bytecode_str = + std::string(uncompressed_bytecode.data(), uncompressed_bytecode.size()); + *mlir_module = mlir::parseSourceString( + uncompressed_bytecode_str, mlir_context); + if (!*mlir_module) { + // Uncompressing was successful but the parsed MLIR module is invalid. + return error_handler.Combine(absl::InvalidArgumentError( + "Failed to parse MLIR module after uncompressing")); + } + } else { + *mlir_module = mlir::parseSourceString( + serialized_mlir_module, mlir_context); + if (!*mlir_module) { + return error_handler.Combine( + absl::InvalidArgumentError("could not parse MLIR module")); + } + } return absl::OkStatus(); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc index d9249d472b334c..3329bff4c02737 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc @@ -126,7 +126,8 @@ void AddDevicesToOp(mlir::Operation* op, const DeviceSet* device_set) { // For device that do not have any metadata, or if we failed to parse metadata // from the DeviceSet, we add a unit attribute to the `tf.devices` attribute. 
for (Device* device : device_set->devices()) { - string name = DeviceNameUtils::ParsedNameToString(device->parsed_name()); + std::string name = + DeviceNameUtils::ParsedNameToString(device->parsed_name()); if (device->device_type() == DEVICE_GPU) { auto metadata = ParseGpuDeviceMetadata(*device, &builder); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc index c3e7ae75022348..abf357873a6153 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc @@ -52,8 +52,8 @@ class FakeDevice : public Device { return errors::Unimplemented("FakeDevice::Sync()"); } - static std::unique_ptr Make(const string& name, - const string& desc = "") { + static std::unique_ptr Make(const std::string& name, + const std::string& desc = "") { DeviceNameUtils::ParsedName parsed_name; DeviceNameUtils::ParseFullName(name, &parsed_name); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc index 7e92860e5ff03e..9d9780d231523f 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc @@ -26,12 +26,12 @@ limitations under the License. 
namespace tensorflow { namespace { -void ExpectHasSubstr(const string& s, const string& expected) { +void ExpectHasSubstr(const std::string& s, const std::string& expected) { EXPECT_TRUE(absl::StrContains(s, expected)) << "'" << s << "' does not contain '" << expected << "'"; } -void ExpectHasNoSubstr(const string& s, const string& expected) { +void ExpectHasNoSubstr(const std::string& s, const std::string& expected) { EXPECT_FALSE(absl::StrContains(s, expected)) << "'" << s << "' should not contain '" << expected << "'"; } @@ -39,7 +39,7 @@ void ExpectHasNoSubstr(const string& s, const string& expected) { // WritableFile that simply concats into string. class StringWritableFile : public WritableFile { public: - explicit StringWritableFile(string* str) : str_(*str) {} + explicit StringWritableFile(std::string* str) : str_(*str) {} absl::Status Append(absl::string_view data) override { absl::StrAppend(&str_, data); @@ -62,7 +62,7 @@ class StringWritableFile : public WritableFile { } private: - string& str_; + std::string& str_; }; TEST(Dump, TextualIrToFileSuccess) { @@ -72,10 +72,10 @@ TEST(Dump, TextualIrToFileSuccess) { setenv("TF_DUMP_GRAPH_PREFIX", testing::TmpDir().c_str(), 1); UseMlirForGraphDump(MlirDumpConfig()); - string ret = DumpGraphToFile("tir", graph); + std::string ret = DumpGraphToFile("tir", graph); ASSERT_EQ(ret, io::JoinPath(testing::TmpDir(), "tir.mlir")); - string actual; + std::string actual; TF_ASSERT_OK(ReadFileToString(Env::Default(), ret, &actual)); } @@ -86,12 +86,12 @@ TEST(Dump, TextualIrWithOptions) { .Attr("dtype", DT_FLOAT) .Finalize(&graph, &node)); - string actual; + std::string actual; StringWritableFile file(&actual); TF_ASSERT_OK(DumpTextualIRToFile(MlirDumpConfig().emit_location_information(), graph, /*flib_def=*/nullptr, &file)); - string expected_substr = R"(loc(#loc))"; + std::string expected_substr = R"(loc(#loc))"; ExpectHasSubstr(actual, expected_substr); } @@ -100,17 +100,17 @@ TEST(Dump, DumpToTFG) { Node* node; 
TF_CHECK_OK(NodeBuilder("A", "NoOp").Finalize(&graph, &node)); - string actual; + std::string actual; StringWritableFile file(&actual); TF_ASSERT_OK(DumpTextualIRToFile( MlirDumpConfig().emit_dialect(MlirDumpConfig::Dialect::kTFG), graph, /*flib_def=*/nullptr, &file)); - string expected_substr("tfg.graph"); + std::string expected_substr("tfg.graph"); ExpectHasSubstr(actual, expected_substr); - string not_expected_substr("tf_executor.island"); + std::string not_expected_substr("tf_executor.island"); ExpectHasNoSubstr(actual, not_expected_substr); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc index b970ca84b326cf..138e13e3719328 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc @@ -44,7 +44,7 @@ struct NameCounts { llvm::StringMap counts; }; -std::string MakeUniqueFilename(string name) { +std::string MakeUniqueFilename(std::string name) { static NameCounts& instance = *new NameCounts; // Remove illegal characters from `name`. @@ -274,7 +274,7 @@ void SetCrashReproducer(mlir::PassManager& pm, llvm::StringRef dir_path) { // Output dirs "sponge" (case-insensitive) have a special meaning: Dump into // the directory specified by the environment variable // TEST_UNDECLARED_OUTPUTS_DIR. 
- string lower_path = absl::AsciiStrToLower(path); + std::string lower_path = absl::AsciiStrToLower(path); if (lower_path == "sponge") { if (!tensorflow::io::GetTestUndeclaredOutputsDir(&path)) { LOG(ERROR) << "MLIR crash reproducer is set to '" << dir_path.str() diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc index 9ec1b9970ae777..9e07ece4e0999e 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc @@ -400,12 +400,12 @@ absl::Status ConvertAttributes( if (auto symbol_ref = mlir::dyn_cast(attr)) { TF_RETURN_IF_ERROR(ConvertAttribute( mlir::cast(symbol_ref), &value)); - func_call_attrs[string(name)] = std::move(value); + func_call_attrs[std::string(name)] = std::move(value); continue; } if (auto func_attr = mlir::dyn_cast(attr)) { TF_RETURN_IF_ERROR(ConvertAttribute(func_attr, remove_ref_type, &value)); - func_call_attrs[string(name)] = std::move(value); + func_call_attrs[std::string(name)] = std::move(value); continue; } if (mlir::isa(attr)) { @@ -434,12 +434,12 @@ absl::Status ConvertAttributes( // input TensorFlow GraphDef shouldn't contain '.'. If it does appear in // the attribute from MLIR, it is treated as an attribute from function // calls. 
- std::vector name_tokens = + std::vector name_tokens = absl::StrSplit(name, '.', absl::SkipEmpty()); TF_RET_CHECK(name_tokens.size() <= 2); auto it = func_call_attrs.find(name_tokens[0]); if (it == func_call_attrs.end()) { - (*values)[string(name)] = std::move(value); + (*values)[std::string(name)] = std::move(value); } else { (*it->second.mutable_func()->mutable_attr())[name_tokens[1]] = std::move(value); @@ -457,7 +457,7 @@ absl::Status SetShapeAttribute(absl::string_view name, AttrValue value; SetTensorShapeProto(shaped_type, value.mutable_list()->add_shape()); - auto result = values->insert({string(name), value}); + auto result = values->insert({std::string(name), value}); if (!result.second) { // This should be extremely rare as it means we are adding the same // attribute multiple times/have some redundancy in representing this diff --git a/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc index 50306edb28b067..fa2ff3c8a281fa 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc @@ -59,7 +59,7 @@ absl::Status LoadProtoFromFileImpl(absl::string_view input_filename, T* proto) { if (std::error_code error = file_or_err.getError()) { return errors::InvalidArgument( "Could not open input file ", - string(input_filename.data(), input_filename.size()).c_str()); + std::string(input_filename.data(), input_filename.size()).c_str()); } const auto& input_file = *file_or_err; diff --git a/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc index a189cc14555143..fbcdc9e894fbd9 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc @@ -41,7 +41,7 @@ const char kTensorPrefix[] = "tftensor$"; } // namespace -string MangleAttributeName(absl::string_view str) { +std::string 
MangleAttributeName(absl::string_view str) { return absl::StrCat(kAttributePrefix, str); } @@ -66,7 +66,7 @@ MangledKind GetMangledKind(absl::string_view str) { } } -string MangleShape(const TensorShapeProto& shape) { +std::string MangleShape(const TensorShapeProto& shape) { return absl::StrCat(kTensorShapePrefix, PrintShortTextProto(shape)); } @@ -74,7 +74,7 @@ absl::Status DemangleShape(absl::string_view str, TensorShapeProto* proto) { return ParseTextProto(str, kTensorShapePrefix, proto); } -string MangleTensor(const TensorProto& tensor) { +std::string MangleTensor(const TensorProto& tensor) { return absl::StrCat(kTensorPrefix, PrintShortTextProto(tensor)); } @@ -82,7 +82,7 @@ absl::Status DemangleTensor(absl::string_view str, TensorProto* proto) { return ParseTextProto(str, kTensorPrefix, proto); } -string MangleDataType(const DataType& dtype) { +std::string MangleDataType(const DataType& dtype) { return absl::StrCat(kDataTypePrefix, DataType_Name(dtype)); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h b/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h index a0c14f27b5b38f..7e95a27f0290f9 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h @@ -28,7 +28,7 @@ namespace mangling_util { enum class MangledKind { kUnknown, kDataType, kTensorShape, kTensor }; // Mangles an attribute name, marking the attribute as a TensorFlow attribute. -string MangleAttributeName(absl::string_view str); +std::string MangleAttributeName(absl::string_view str); // Returns true if 'str' was mangled with MangleAttributeName. bool IsMangledAttributeName(absl::string_view str); @@ -41,17 +41,17 @@ absl::string_view DemangleAttributeName(absl::string_view str); MangledKind GetMangledKind(absl::string_view str); // Return a TensorShapeProto mangled as a string. 
-string MangleShape(const TensorShapeProto& shape); +std::string MangleShape(const TensorShapeProto& shape); // Demangle a string mangled with MangleShape. absl::Status DemangleShape(absl::string_view str, TensorShapeProto* proto); // Return a TensorProto mangled as a string. -string MangleTensor(const TensorProto& tensor); +std::string MangleTensor(const TensorProto& tensor); // Demangle a string mangled with MangleTensor. absl::Status DemangleTensor(absl::string_view str, TensorProto* proto); // Return a DataType mangled as a string. -string MangleDataType(const DataType& dtype); +std::string MangleDataType(const DataType& dtype); // Demangle a string mangled with MangleDataType. absl::Status DemangleDataType(absl::string_view str, DataType* proto); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.cc index abea6d6602b862..e960de8acc494e 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.cc @@ -18,12 +18,77 @@ limitations under the License. 
#include #include +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Bytecode/BytecodeWriter.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/jit/flags.h" +#include "xla/tsl/lib/io/zlib_compression_options.h" +#include "xla/tsl/lib/io/zlib_outputbuffer.h" +#include "xla/tsl/platform/errors.h" +#include "xla/tsl/platform/file_system.h" + namespace tensorflow { +namespace { +class WritableStringFile : public tsl::WritableFile { + public: + explicit WritableStringFile(std::string* data) : data_(data) {}; + ~WritableStringFile() override = default; + + absl::Status Append(absl::string_view data) override { + absl::StrAppend(data_, data); + return absl::OkStatus(); + } + + absl::Status Close() override { return absl::OkStatus(); } + absl::Status Flush() override { return absl::OkStatus(); } + absl::Status Sync() override { return absl::OkStatus(); } + + private: + std::string* data_; +}; +} // namespace + +absl::StatusOr SerializeMlirModuleToCompressedBytecode( + mlir::ModuleOp module_op) { + std::string bytecode; + llvm::raw_string_ostream os(bytecode); + mlir::BytecodeWriterConfig config; + if (mlir::failed(mlir::writeBytecodeToFile(module_op, os, config))) { + return absl::InternalError("Failed to serialize MLIR module to bytecode."); + } + std::string compressed_bytecode; + WritableStringFile f(&compressed_bytecode); + + tsl::io::ZlibCompressionOptions options = + tsl::io::ZlibCompressionOptions::GZIP(); + tsl::io::ZlibOutputBuffer buffer(&f, options.input_buffer_size, + options.output_buffer_size, options); + TF_RETURN_IF_ERROR(buffer.Init()); + TF_RETURN_IF_ERROR(buffer.Append(bytecode)); + 
TF_RETURN_IF_ERROR(buffer.Close()); + return compressed_bytecode; +} std::string SerializeMlirModule(mlir::ModuleOp module_op) { + if (GetMlirCommonFlags()->tf_serialize_mlir_to_compressed_bytecode) { + auto compressed_bytecode = + SerializeMlirModuleToCompressedBytecode(module_op); + if (compressed_bytecode.ok()) { + return compressed_bytecode.value(); + } + LOG_IF(ERROR, !compressed_bytecode.ok()) + << "Failed to serialize MLIR module to " + "compressed bytecode." + << compressed_bytecode.status(); + return ""; + } std::string serialized_mlir_module; llvm::raw_string_ostream os(serialized_mlir_module); mlir::OpPrintingFlags print_flags; diff --git a/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h index 78c7fb6c3857b3..4e264c5f566a9c 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h @@ -18,10 +18,13 @@ limitations under the License. #include +#include "absl/status/statusor.h" #include "mlir/IR/BuiltinOps.h" // from @llvm-project namespace tensorflow { - +// Serializes a MLIR module `module_op` to a compressed bytecode string. +absl::StatusOr SerializeMlirModuleToCompressedBytecode( + mlir::ModuleOp module_op); // Prints a MLIR module `module_op` and returns it as a string. std::string SerializeMlirModule(mlir::ModuleOp module_op); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils_test.cc index d4f7eb11f81dff..d373e38cbaacbf 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils_test.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/deserialize_mlir_module_utils.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { @@ -42,5 +43,17 @@ TEST(SerializeMlirModuleUtilsTest, DebugInfoSerialization) { EXPECT_FALSE(absl::StrContains(serialized_module, "loc(")); } +TEST(SerializeMlirModuleUtilsTest, CompressedBytecodeSerializationRoundTrip) { + GetMlirCommonFlags()->tf_serialize_mlir_to_compressed_bytecode = true; + mlir::MLIRContext context; + mlir::OwningOpRef module_ref = + mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); + std::string mlir_module_str = tensorflow::SerializeMlirModule(*module_ref); + mlir::OwningOpRef deserialized_module; + EXPECT_TRUE(tensorflow::DeserializeMlirModule(mlir_module_str, &context, + &deserialized_module) + .ok()); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.cc index c9a6f6e85c9d4d..c1479fead3a595 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.cc @@ -133,7 +133,7 @@ absl::Status SetTypeAttribute(absl::string_view name, ContainerT types, type_list.add_type(dtype); } - auto result = values->insert({string(name), value}); + auto result = values->insert({std::string(name), value}); assert(result.second && "cannot have multiple attributes with the same name"); (void)result; @@ -164,7 +164,7 @@ void SetShapeAttribute(absl::string_view name, ContainerT shapes, // If shape is already set, override it. This can happen if we import // without shape inference enabled and so couldn't be removed on import and // are not explicitly dropped later. 
- (*values)[string(name)] = value; + (*values)[std::string(name)] = value; } // Collects all the unregistered attributes for an TF dialect operation. diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc index 8cb797a9a9b214..b13e099fde3557 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc @@ -214,7 +214,7 @@ absl::StatusOr> BuildConstOpGraphWithOutputShapes() { std::initializer_list dims = {2, 3, 4, 5}; Tensor tensor(data_type, TensorShape(dims)); for (int i = 0; i < 2 * 3 * 4 * 5; ++i) { - tensor.flat()(i) = i; + tensor.flat()(i) = i; } NodeDef node; diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc index 46f7f5de1d0856..74b7304b745033 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc @@ -106,9 +106,9 @@ namespace { // Time the execution of kernels (in CPU cycles). Meant to be used as RAII. 
struct CompilationTimer { - uint64 start_cycles = profile_utils::CpuUtils::GetCurrentClockCycle(); + uint64_t start_cycles = profile_utils::CpuUtils::GetCurrentClockCycle(); - uint64 ElapsedCycles() { + uint64_t ElapsedCycles() { return profile_utils::CpuUtils::GetCurrentClockCycle() - start_cycles; } diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc index 243f4333a88525..2ab0c3c619b292 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc @@ -4864,7 +4864,7 @@ class ConvertConvBackpropInputOp : public OpRewritePattern { dilations_attr.template getValues().begin(), dilations_attr.template getValues().end()}; auto strides_attr = GetI64ElementsAttr(op.getStrides()); - std::vector strides{ + std::vector strides{ strides_attr.template getValues().begin(), strides_attr.template getValues().end()}; @@ -5064,7 +5064,7 @@ class ConvertConvBackpropFilterOp : public OpRewritePattern { dilations_attr.template getValues().begin(), dilations_attr.template getValues().end()}; auto strides_attr = GetI64ElementsAttr(op.getStrides()); - std::vector strides{ + std::vector strides{ strides_attr.template getValues().begin(), strides_attr.template getValues().end()}; diff --git a/tensorflow/compiler/mlir/tfr/BUILD b/tensorflow/compiler/mlir/tfr/BUILD index 159bc8b17bc36b..a6ee4c3e1ffbd0 100644 --- a/tensorflow/compiler/mlir/tfr/BUILD +++ b/tensorflow/compiler/mlir/tfr/BUILD @@ -308,7 +308,7 @@ py_strict_library( "//tensorflow/python/framework:op_def_registry", "//tensorflow/python/platform:tf_logging", "//tensorflow/python/util:tf_inspect", - "@pypi_gast//:pkg", + "@pypi//gast", ], ) @@ -339,7 +339,7 @@ py_strict_library( "//tensorflow/python/autograph/pyct:transpiler", "//tensorflow/python/framework:op_def_registry", "//tensorflow/python/util:tf_inspect", - "@pypi_gast//:pkg", + "@pypi//gast", ], ) diff --git 
a/tensorflow/compiler/mlir/tfr/utils/utils.cc b/tensorflow/compiler/mlir/tfr/utils/utils.cc index f9e70b228c0b71..ddff766c789450 100644 --- a/tensorflow/compiler/mlir/tfr/utils/utils.cc +++ b/tensorflow/compiler/mlir/tfr/utils/utils.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" #include "mlir/IR/Block.h" // from @llvm-project @@ -92,9 +93,9 @@ std::string GetComposeFuncName(StringRef tf_op_name) { } if (tf_op_name[i] == '.') { compose_func_name.push_back('_'); - } else if (tf_op_name[i] >= 'A' && tf_op_name[i] <= 'Z') { + } else if (llvm::isUpper(tf_op_name[i])) { compose_func_name.push_back('_'); - compose_func_name.push_back(tf_op_name[i] + 'a' - 'A'); + compose_func_name.push_back(llvm::toLower(tf_op_name[i])); } else { compose_func_name.push_back(tf_op_name[i]); } @@ -106,13 +107,13 @@ std::string GetTFOpName(StringRef compose_func_name) { std::string tf_op_name; bool after_underscore = false; for (int i = 0; i < compose_func_name.size(); ++i) { - if (compose_func_name[i] >= 'A' && compose_func_name[i] <= 'Z') { + if (llvm::isUpper(compose_func_name[i])) { // The field name must not contain uppercase letters. return {}; } if (after_underscore) { - if (compose_func_name[i] >= 'a' && compose_func_name[i] <= 'z') { - tf_op_name.push_back(compose_func_name[i] + 'A' - 'a'); + if (llvm::isLower(compose_func_name[i])) { + tf_op_name.push_back(llvm::toUpper(compose_func_name[i])); after_underscore = false; } else { // The character after a "_" must be a lowercase letter. 
diff --git a/tensorflow/compiler/mlir/tfrt/tests/sink_in_invariant_ops.mlir b/tensorflow/compiler/mlir/tfrt/tests/sink_in_invariant_ops.mlir index 42e2e7ccb5086a..d6a4b0c3fcbf97 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/sink_in_invariant_ops.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/sink_in_invariant_ops.mlir @@ -195,6 +195,28 @@ func.func @sink_in_stateful_call(%arg0: tensor {tf_saved_model.index_path = func.return %2 : tensor } +// Test VarHandleOp getting sinked when it is used by the called function and returned by the called function. + +// CHECK: func private @func_use_and_return_varhandle([[arg0:.+]]: tensor>>) +func.func private @func_use_and_return_varhandle(%arg0: tensor>>) -> (tensor, tensor>>) { + // CHECK: tf.VarHandleOp + // CHECK-NEXT: tf.ReadVariableOp + %0 = "tf.ReadVariableOp"(%arg0) {device = "cpu"} : (tensor>>) -> tensor + + func.return %0, %arg0 : tensor, tensor>> +} + +// CHECK-LABEL: func @sink_in_stateful_call_varhandle_return +func.func @sink_in_stateful_call_varhandle_return(%arg0: tensor {tf_saved_model.index_path = ["input"]}) -> (tensor {tf_saved_model.index_path = ["r"]}) + attributes {tf_saved_model.exported_names = ["test_sink_in_stateful_call_varhandle_return"]} { + // CHECK: tf.VarHandleOp + %0 = "tf.VarHandleOp"() {container = "", shared_name = "x"} : () -> tensor>> + // CHECK: "tf.StatefulPartitionedCall"(%0) + %1:2 = "tf.StatefulPartitionedCall"(%0) {device = "/CPU:0", config = "", config_proto = "", executor_type = "", f = @func_use_and_return_varhandle} : (tensor>>) -> (tensor, tensor>>) + %2 = "tf.AddV2"(%arg0, %1#0) {device = "/CPU:0"} : (tensor, tensor) -> tensor + func.return %2 : tensor +} + // CHECK-LABEL: func @sink_in_if func.func @sink_in_if(%arg0: tensor {tf_saved_model.index_path = ["input"]}) -> (tensor {tf_saved_model.index_path = ["r"]}) attributes {tf_saved_model.exported_names = ["test_sink_in_if"]} { @@ -374,3 +396,54 @@ func.func @nested_sink_in_if(%arg: tensor {tf_saved_model.index_path = ["in } 
} + +// ----- + +module attributes {tf_saved_model.semantics} { + +// Test sinks crossing nested tf.While and BatchFunction, while the sinkable ops are only copied at the target. + +// CHECK-LABEL: func private @batched_function +func.func private @batched_function(%arg0: tensor>>) -> tensor + attributes {tf._input_shapes = [#tf_type.shape<1x3>, #tf_type.shape<*>], tf.signature.is_stateful} { + // CHECK: tf.VarHandleOp + // CHECK-NEXT: tf.ReadVariableOp + %1 = "tf.ReadVariableOp"(%arg0) {device = "/device:CPU:0"} : (tensor>>) -> tensor + %2 = "tf.Identity"(%1) {device = "/device:CPU:0"} : (tensor) -> tensor + func.return %2 : tensor +} + +// CHECK-LABEL: func private @while_cond_func +func.func private @while_cond_func( + %arg0: tensor, + %arg1: tensor, + %arg: tensor>>) -> tensor { + // CHECK: [[handle:%.*]] = "tf.VarHandleOp"() + // CHECK: "tf.ReadVariableOp"([[handle]]) + %0 = "tf.ReadVariableOp"(%arg) {device = "cpu"} : (tensor>>) -> tensor + func.return %0 : tensor +} + +// CHECK-LABEL: func private @while_body_func +func.func private @while_body_func( + %arg0: tensor, + %arg1: tensor, + %arg2: tensor>>) -> (tensor, tensor, tensor>>) { + // CHECK: "tf.BatchFunction"(%arg2) + %0 = "tf.BatchFunction"(%arg2) {allowed_batch_sizes = [6], batch_timeout_micros = 100000 : i64, batching_queue = "", container = "", device = "/device:CPU:0", enable_large_batch_splitting = false, f = @batched_function, max_batch_size = 6 : i64, max_enqueued_batches = 10 : i64, num_batch_threads = 1 : i64, operandSegmentSizes = array, shared_name = "batch/"} : (tensor>>) -> tensor + func.return %0, %arg0, %arg2 : tensor, tensor, tensor>> +} + +// CHECK-LABEL: func @nested_sink_in_while_and_batch_functions +func.func @nested_sink_in_while_and_batch_functions(%arg: tensor {tf_saved_model.index_path = ["input"]}) -> (tensor {tf_saved_model.index_path = ["r"]}) + attributes {tf_saved_model.exported_names = ["test_sink_in_while_and_batch_functions"]} { + // CHECK: [[handle:%.*]] = 
"tf.VarHandleOp"() + %handle = "tf.VarHandleOp"() {container = "", shared_name = "x"} : () -> tensor>> + // CHECK: [[cond:%.*]] = "tf.Const"() + %cond = "tf.Const"() {device = "/CPU:0", value = dense<0> : tensor} : () -> tensor + // CHECK: "tf.While"([[cond]], [[cond]], [[handle]]) + %x:3 = "tf.While"(%cond, %cond, %handle) {body = @while_body_func, cond = @while_cond_func, is_stateless = false, parallel_iterations = 10 : i64, shape_invariant} : (tensor, tensor, tensor>>) -> (tensor, tensor, tensor>>) + func.return %x#0 : tensor +} + +} diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler_test.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler_test.cc index 32898953f8973e..5340015658621a 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler_test.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler_test.cc @@ -81,10 +81,10 @@ class IfrtBackendCompilerTest : public ::testing::Test { } void verifyModules() { - absl::MutexLock l(&ServingExecutableRegistry::mu_); + absl::MutexLock l(ServingExecutableRegistry::mu_); for (const auto& [_, executable] : *ServingExecutableRegistry::executables_) { - absl::MutexLock l(&executable->mutex_); + absl::MutexLock l(executable->mutex_); executable->module_->walk([](mlir::func::FuncOp func) { ASSERT_FALSE(func->hasAttr("tfrt_ifrt_serving.program_id")); }); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h index c64672cdb10e69..9d0efd51791b87 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h @@ -25,6 +25,10 @@ namespace ifrt_serving { struct DtypeAndShape { tensorflow::DataType dtype; tensorflow::TensorShape shape; + + bool operator==(const DtypeAndShape& other) const { + return dtype == other.dtype && shape == other.shape; + } }; } // namespace ifrt_serving diff --git 
a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h index 2cb92cb8baac1f..6ff373d7ce0b43 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h @@ -77,6 +77,8 @@ class TfToHloCompiler { virtual absl::StatusOr Key(const Tf2HloArg& arg); virtual absl::StatusOr CompileTfToHlo(Tf2HloArg& arg); + + virtual bool IsXlaCompilationDisabled() const { return false; } }; } // namespace ifrt_serving diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc index cc59c9150da769..7f4a602b1330a6 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc @@ -906,10 +906,6 @@ void CreateFallbackInitializationFunction( builder.create( func_op.getLoc(), /*resultTypes=*/mlir::TypeRange{}, /*operands=*/mlir::ValueRange{}, op->getAttrs()); - } else { - // TODO: b/381849919 - Remove this log once the bug is fixed. - LOG_FIRST_N(WARNING, 100) - << "Skip creation of fallback kernel for op index " << op_index; } } diff --git a/tensorflow/compiler/mlir/tfrt/transforms/passes.cc b/tensorflow/compiler/mlir/tfrt/transforms/passes.cc index ddff1b2bde43f9..990b3da433c327 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/passes.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/passes.cc @@ -152,7 +152,6 @@ void CreateTFExecutorToTFPreInvariantOptimizationPipelineHelper( pm.addPass(mlir::createInlinerPass()); pm.addNestedPass( mlir::TF::CreateRemoveUnusedWhileResultsPass()); - pm.addPass(mlir::TF::CreateTFRegionControlFlowToFunctional()); // Apply standard optimization after optimizing control flow ops. 
pm.addPass(mlir::createInlinerPass()); @@ -163,6 +162,7 @@ void CreateTFExecutorToTFPreInvariantOptimizationPipelineHelper( // by performing shape inference again after reference variable to resource // variable conversion. We should remove this after b/187876545 is fixed. pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + pm.addPass(mlir::TF::CreateTFRegionControlFlowToFunctional()); pm.addNestedPass( mlir::TFDevice::CreateLaunchToDeviceAttributePass()); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/sink_in_invariant_ops.cc b/tensorflow/compiler/mlir/tfrt/transforms/sink_in_invariant_ops.cc index 4615c521edb059..fddb217d4c57ee 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/sink_in_invariant_ops.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/sink_in_invariant_ops.cc @@ -49,15 +49,28 @@ bool IsSinkCandidate(mlir::Operation *op) { // Check if the op is allowed to be sinked. We are being conservative here to // whilelist very limited set of ops here. struct AllowSinkHelper { - explicit AllowSinkHelper(mlir::Operation *op, int arg_index) { + explicit AllowSinkHelper(mlir::Operation* sinked_op, mlir::Operation* user, + int arg_index) { if (llvm::isa(op)) { + mlir::TF::StatefulPartitionedCallOp>(user)) { allow_sink_to = true; callee_arg_index = arg_index; return; } - if (llvm::isa(op) && arg_index > 0) { + // We tend to limit this support on WhileOp to only VarHandleOp to satisfy + // IFRT lowering requirements. + // Sinking other invariants like ConstOp is error-prone because it requires + // non-trivial effort to avoid sinking Consts when they are used by cond + // function and we don't need such support. 
+ if (llvm::isa(sinked_op) && + llvm::isa(user)) { + allow_sink_to = true; + callee_arg_index = arg_index; + return; + } + + if (llvm::isa(user) && arg_index > 0) { + allow_sink_to = true; + callee_arg_index = arg_index - 1; + return; @@ -107,7 +120,8 @@ void FindSinkTarget( for (mlir::OpOperand &use : value.getUses()) { auto *user = use.getOwner(); - AllowSinkHelper helper(user, use.getOperandNumber()); + AllowSinkHelper helper(original.getDefiningOp(), user, + use.getOperandNumber()); if (helper.allow_sink_to) { auto values = FindValueInCallees(symbol_table, symbol_users, user, @@ -116,6 +130,14 @@ void FindSinkTarget( FindSinkTarget(symbol_table, symbol_users, original, value, targets); } } else if (value != original) { + // If the sinked op is directly used by ReturnOp, we don't sink it. + // One example is for tf.WhileOp, the input and output of the cond + // function and the body function must be the same. If the cond function + // has an input of type tf.VarHandleOp and it just returns the VarHandleOp, + // we don't need to sink it.
+ if (llvm::isa(user)) { + continue; + } targets[&use].insert(original); } } diff --git a/tensorflow/compiler/mlir/tfrt/translate/import_model.cc b/tensorflow/compiler/mlir/tfrt/translate/import_model.cc index e8004f17a24b47..d6d93d9f2d6f34 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tfrt/translate/import_model.cc @@ -204,11 +204,18 @@ absl::Status ConvertTfMlirToRuntimeExecutable( tensorflow::tf2xla::v2::RunFunctionTf2xlaClusteringBridge( module, /*is_supported_by_replicated_brige*/ true, /*is_in_fallback_enabled_mode=*/VLOG_IS_ON(1))); + if (VLOG_IS_ON(1)) { + tensorflow::DumpMlirOpToFile("after_tf2xla_clustering_bridge", module); + } TF_RETURN_IF_ERROR( tensorflow::tfrt_compiler::RunLowerClusterToRuntimeOpsPassPipeline( module, tsl::DeviceType(DEVICE_TPU_XLA_JIT))); + if (VLOG_IS_ON(1)) { + tensorflow::DumpMlirOpToFile("after_lower_cluster_to_runtime_ops", + module); + } TF_RETURN_IF_ERROR( tensorflow::tf2xla::v2::ExportFromTensorflowDialectToExecutor(module)); } else if (options.device_target == TfrtDeviceInfraTarget::kTfFallback) { diff --git a/tensorflow/compiler/mlir/tfrt/translate/mlrt/BUILD b/tensorflow/compiler/mlir/tfrt/translate/mlrt/BUILD index b1a4a4b96f3b72..527a724c491b96 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/mlrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/BUILD @@ -19,6 +19,7 @@ cc_library( srcs = ["mlir_to_bytecode.cc"], hdrs = ["mlir_to_bytecode.h"], deps = [ + "//tensorflow/compiler/mlir/tfrt/ir/mlrt:mlrt_ops", "//tensorflow/core/tfrt/mlrt/bytecode", "//tensorflow/core/tfrt/mlrt/bytecode:executable", "//tensorflow/core/tfrt/mlrt/bytecode:function", @@ -43,6 +44,7 @@ tf_cc_test( data = glob(["testdata/**"]), deps = [ ":mlir_to_bytecode", + "//tensorflow/compiler/mlir/tfrt/ir/mlrt:mlrt_ops", "//tensorflow/core/tfrt/mlrt/bytecode", "//tensorflow/core/tfrt/mlrt/bytecode:executable", "//tensorflow/core/tfrt/mlrt/interpreter:attribute_span", diff --git 
a/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc index 52b1826f4a1f65..2324f958f19266 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc @@ -41,6 +41,7 @@ limitations under the License. #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h" #include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" #include "tensorflow/core/tfrt/mlrt/bytecode/executable.h" #include "tensorflow/core/tfrt/mlrt/bytecode/function.h" @@ -169,19 +170,26 @@ struct FunctionEmitterContext { struct RegInfo { int num_uses = 0; int id = -1; + bool persistent = false; // True if the register should not be freed }; int next_reg_id = 0; llvm::DenseMap register_table; std::vector free_regs; - int AssignRegId() { - if (free_regs.empty()) { + int AssignRegId(bool is_persistent) { + if (is_persistent) { + // Persistent types ALWAYS get a brand new ID. return next_reg_id++; } - int id = free_regs.back(); - free_regs.pop_back(); - return id; + + // Non-persistent types can reuse from free_regs. 
+ if (!free_regs.empty()) { + int id = free_regs.back(); + free_regs.pop_back(); + return id; + } + return next_reg_id++; } void FreeRegId(int id) { free_regs.push_back(id); } @@ -202,7 +210,7 @@ void EmitKernel(FunctionEmitterContext& function_context, auto iter = function_context.register_table.find(result); CHECK(iter != function_context.register_table.end()); // Crash Ok CHECK_EQ(iter->second.id, -1); // Crash Ok - iter->second.id = function_context.AssignRegId(); + iter->second.id = function_context.AssignRegId(iter->second.persistent); results.push_back(iter->second.id); } constructor.construct_results(results.size()) @@ -218,9 +226,12 @@ void EmitKernel(FunctionEmitterContext& function_context, int id = iter->second.id; CHECK_NE(id, -1); // Crash Ok last_uses.push_back(0); - if (--iter->second.num_uses == 0) { - function_context.FreeRegId(id); - last_uses.back() = 1; + auto& reg_info = iter->second; + if (!reg_info.persistent) { + if (--reg_info.num_uses == 0) { + function_context.FreeRegId(id); + last_uses.back() = 1; + } } arguments.push_back(id); } @@ -282,18 +293,23 @@ void EmitFunction(const ModuleEmitterContext& module_context, std::vector input_regs; input_regs.reserve(block.getNumArguments()); for (auto arg : block.getArguments()) { - int id = function_context.AssignRegId(); + bool persistent = mlir::isa(arg.getType()); + int id = function_context.AssignRegId(persistent); input_regs.push_back(id); register_table[arg] = {static_cast(std::distance(arg.getUses().begin(), arg.getUses().end())), - id}; + id, persistent}; } constructor.construct_input_regs(input_regs); for (auto& op : block) { for (auto result : op.getResults()) { - register_table[result] = {static_cast( - std::distance(result.getUses().begin(), result.getUses().end()))}; + bool persistent = + mlir::isa(result.getType()); + register_table[result] = { + static_cast( + std::distance(result.getUses().begin(), result.getUses().end())), + -1, persistent}; } } diff --git 
a/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode_test.cc b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode_test.cc index b69f53cc9c7a2c..53f2e7591c8a9a 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode_test.cc +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode_test.cc @@ -33,6 +33,7 @@ limitations under the License. #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/Parser/Parser.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h" #include "xla/tsl/platform/resource_loader.h" #include "xla/tsl/platform/status_matchers.h" #include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" @@ -45,8 +46,6 @@ namespace { using ::testing::ElementsAreArray; using ::testing::FloatEq; using ::testing::IsEmpty; -using ::tsl::testing::IsOkAndHolds; -using ::tsl::testing::StatusIs; TEST(MlirToByteCodeTest, Basic) { constexpr char kBasicMlir[] = @@ -147,16 +146,20 @@ TEST(MlirToByteCodeTest, BasicAttributes) { EXPECT_EQ(*attr_iter, "ts"); ++attr_iter; - EXPECT_THAT(DecodeAttribute(*attr_iter), IsOkAndHolds(100)); + EXPECT_THAT(DecodeAttribute(*attr_iter), + absl_testing::IsOkAndHolds(100)); ++attr_iter; - EXPECT_THAT(DecodeAttribute(*attr_iter), IsOkAndHolds(200)); + EXPECT_THAT(DecodeAttribute(*attr_iter), + absl_testing::IsOkAndHolds(200)); ++attr_iter; - EXPECT_THAT(DecodeAttribute(*attr_iter), IsOkAndHolds(FloatEq(3.0))); + EXPECT_THAT(DecodeAttribute(*attr_iter), + absl_testing::IsOkAndHolds(FloatEq(3.0))); ++attr_iter; - EXPECT_THAT(DecodeAttribute(*attr_iter), IsOkAndHolds(0)); + EXPECT_THAT(DecodeAttribute(*attr_iter), + absl_testing::IsOkAndHolds(0)); ++attr_iter; bc::Vector list_of_i64((*attr_iter).data()); @@ -171,7 +174,8 @@ TEST(MlirToByteCodeTest, BasicAttributes) { EXPECT_THAT(list_of_str, ElementsAreArray({"string 0", "string 1"})); ++attr_iter; - EXPECT_THAT(DecodeAttribute(*attr_iter), 
IsOkAndHolds(1)); + EXPECT_THAT(DecodeAttribute(*attr_iter), + absl_testing::IsOkAndHolds(1)); EXPECT_EQ(executable.functions()[1].name().Get(), "callee"); ++attr_iter; @@ -272,9 +276,10 @@ TEST(MlirToByteCodeTest, UnsupportedAttributes) { &mlir_context); AttributeEncoderRegistry attribute_encoder_registry; - EXPECT_THAT(EmitExecutable(attribute_encoder_registry, mlir_module.get()), - StatusIs(absl::StatusCode::kInvalidArgument, - "Try to encode unsupported attribute: unit")); + EXPECT_THAT( + EmitExecutable(attribute_encoder_registry, mlir_module.get()), + absl_testing::StatusIs(absl::StatusCode::kInvalidArgument, + "Try to encode unsupported attribute: unit")); } class CustomDense { @@ -378,5 +383,129 @@ TEST(MlirToByteCodeTest, CustomDense) { } } +TEST(MlirToByteCodeTest, AsyncNotFreed) { + constexpr char kAsyncMlir[] = + "tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/async.mlir"; + + mlir::DialectRegistry registry; + registry.insert(); + mlir::MLIRContext mlir_context(registry); + mlir_context.allowUnregisteredDialects(); + auto mlir_module = mlir::parseSourceFile( + tsl::GetDataDependencyFilepath(kAsyncMlir), &mlir_context); + + AttributeEncoderRegistry attribute_encoder_registry; + bc::Buffer buffer = + EmitExecutable(attribute_encoder_registry, mlir_module.get()).value(); + + bc::Executable executable(buffer.data()); + + auto kernel_names = executable.kernel_names(); + EXPECT_THAT(kernel_names, + ElementsAreArray({"test_mlbc.add.i32", "return", "mlrt.async", + "mlrt.await_handle"})); + + auto functions = executable.functions(); + ASSERT_EQ(functions.size(), 2); + + auto function = functions[1]; + EXPECT_EQ(function.name().str(), "main"); + EXPECT_EQ(function.num_regs(), 4); + EXPECT_THAT(function.input_regs(), ElementsAreArray({0, 1})); + EXPECT_THAT(function.output_regs(), ElementsAreArray({1})); + EXPECT_THAT(function.output_last_uses(), ElementsAreArray({true})); + + auto kernels = function.kernels(); + ASSERT_EQ(kernels.size(), 5); + + 
EXPECT_EQ(kernels[0].code(), 2); // mlrt.async + EXPECT_THAT(kernels[0].arguments(), ElementsAreArray({0, 1})); + // The returned handle is in register 2, which is never used by other kernels. + EXPECT_THAT(kernels[0].results(), ElementsAreArray({2})); + EXPECT_THAT(kernels[0].last_uses(), ElementsAreArray({false, false})); + + EXPECT_EQ(kernels[1].code(), 3); // mlrt.await_handle + EXPECT_THAT(kernels[1].arguments(), ElementsAreArray({2})); + EXPECT_THAT(kernels[1].results(), IsEmpty()); + + EXPECT_EQ(kernels[2].code(), 0); // test_mlbc.add.i32 + EXPECT_THAT(kernels[2].arguments(), ElementsAreArray({0, 1})); + EXPECT_THAT(kernels[2].results(), ElementsAreArray({3})); + EXPECT_THAT(kernels[2].last_uses(), ElementsAreArray({true, true})); + + EXPECT_EQ(kernels[3].code(), 0); // test_mlbc.add.i32 + EXPECT_THAT(kernels[3].arguments(), ElementsAreArray({3, 3})); + EXPECT_THAT(kernels[3].results(), ElementsAreArray({1})); + EXPECT_THAT(kernels[3].last_uses(), ElementsAreArray({false, true})); + + EXPECT_EQ(kernels[4].code(), 1); // return + EXPECT_THAT(kernels[4].arguments(), ElementsAreArray({1})); + EXPECT_THAT(kernels[4].results(), IsEmpty()); + EXPECT_THAT(kernels[4].last_uses(), ElementsAreArray({true})); +} + +TEST(MlirToByteCodeTest, AsyncUseNewId) { + constexpr char kAsyncMlir[] = + "tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/async2.mlir"; + + mlir::DialectRegistry registry; + registry.insert(); + mlir::MLIRContext mlir_context(registry); + mlir_context.allowUnregisteredDialects(); + auto mlir_module = mlir::parseSourceFile( + tsl::GetDataDependencyFilepath(kAsyncMlir), &mlir_context); + + AttributeEncoderRegistry attribute_encoder_registry; + bc::Buffer buffer = + EmitExecutable(attribute_encoder_registry, mlir_module.get()).value(); + + bc::Executable executable(buffer.data()); + + auto kernel_names = executable.kernel_names(); + EXPECT_THAT(kernel_names, + ElementsAreArray({"test_mlbc.add.i32", "return", "mlrt.async", + "mlrt.await_handle"})); + + 
auto functions = executable.functions(); + ASSERT_EQ(functions.size(), 2); + + auto function = functions[1]; + EXPECT_EQ(function.name().str(), "main"); + EXPECT_EQ(function.num_regs(), 4); + EXPECT_THAT(function.input_regs(), ElementsAreArray({0, 1})); + EXPECT_THAT(function.output_regs(), ElementsAreArray({1})); + EXPECT_THAT(function.output_last_uses(), ElementsAreArray({true})); + + auto kernels = function.kernels(); + ASSERT_EQ(kernels.size(), 5); + + EXPECT_EQ(kernels[0].code(), 0); // test_mlbc.add.i32 + EXPECT_THAT(kernels[0].arguments(), ElementsAreArray({0, 1})); + EXPECT_THAT(kernels[0].results(), ElementsAreArray({2})); + EXPECT_THAT(kernels[0].last_uses(), ElementsAreArray({true, true})); + + EXPECT_EQ(kernels[1].code(), 2); // mlrt.async + EXPECT_THAT(kernels[1].arguments(), ElementsAreArray({2, 2})); + // The returned handle is in register 3, which is never used by other kernels. + EXPECT_THAT(kernels[1].results(), ElementsAreArray({3})); + EXPECT_THAT(kernels[1].last_uses(), ElementsAreArray({false, false})); + + EXPECT_EQ(kernels[2].code(), 3); // mlrt.await_handle + EXPECT_THAT(kernels[2].arguments(), ElementsAreArray({3})); + EXPECT_THAT(kernels[2].results(), IsEmpty()); + EXPECT_THAT(kernels[2].last_uses(), ElementsAreArray({false})); + + EXPECT_EQ(kernels[3].code(), 0); // test_mlbc.add.i32 + EXPECT_THAT(kernels[3].arguments(), ElementsAreArray({2, 2})); + // AsyncHandle does not free its register. So this can only use 1. 
+ EXPECT_THAT(kernels[3].results(), ElementsAreArray({1})); + EXPECT_THAT(kernels[3].last_uses(), ElementsAreArray({false, true})); + + EXPECT_EQ(kernels[4].code(), 1); // return + EXPECT_THAT(kernels[4].arguments(), ElementsAreArray({1})); + EXPECT_THAT(kernels[4].results(), IsEmpty()); + EXPECT_THAT(kernels[4].last_uses(), ElementsAreArray({true})); +} + } // namespace } // namespace mlrt diff --git a/tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/async.mlir b/tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/async.mlir new file mode 100644 index 00000000000000..f3816531218c81 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/async.mlir @@ -0,0 +1,15 @@ +func.func @add_i32(%arg0: i32, %arg1: i32) -> i32 { + %0 = "test_mlbc.add.i32"(%arg0, %arg1) : (i32, i32) -> i32 + func.return %0 : i32 +} + +func.func @main(%arg0: i32, %arg1: i32) -> i32 { + %handle = "mlrt.async"(%arg0, %arg1) {callee = @add_i32} : (i32, i32) -> !mlrt.async_handle + + "mlrt.await_handle"(%handle) : (!mlrt.async_handle) -> () + + %c1 = "test_mlbc.add.i32"(%arg0, %arg1) : (i32, i32) -> i32 + %c2 = "test_mlbc.add.i32"(%c1, %c1) : (i32, i32) -> i32 + + func.return %c2 : i32 +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/async2.mlir b/tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/async2.mlir new file mode 100644 index 00000000000000..c960fedd2adc25 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/async2.mlir @@ -0,0 +1,16 @@ +func.func @add_i32(%arg0: i32, %arg1: i32) -> i32 { + %0 = "test_mlbc.add.i32"(%arg0, %arg1) : (i32, i32) -> i32 + func.return %0 : i32 +} + +func.func @main(%arg0: i32, %arg1: i32) -> i32 { + %c1 = "test_mlbc.add.i32"(%arg0, %arg1) : (i32, i32) -> i32 + + %handle = "mlrt.async"(%c1, %c1) {callee = @add_i32} : (i32, i32) -> !mlrt.async_handle + + "mlrt.await_handle"(%handle) : (!mlrt.async_handle) -> () + + %c2 = "test_mlbc.add.i32"(%c1, %c1) : (i32, i32) -> 
i32 + + func.return %c2 : i32 +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc index 7b3625103efc1f..079500b8cd1ccf 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc @@ -163,9 +163,11 @@ class GpuKernelToBlobPass target->Options.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast; }; - TF_ASSIGN_OR_RETURN(std::string ptx, xla::gpu::nvptx::CompileToPtx( - llvm_module_copy.get(), cc, - options, enable_fusion)); + TF_ASSIGN_OR_RETURN( + std::string ptx, + xla::gpu::nvptx::CompileToPtx( + llvm_module_copy.get(), stream_executor::GpuComputeCapability(cc), + options, enable_fusion)); if (print_ptx_) { llvm::dbgs() << "Generated PTX code for module '" << gpu_module.getName() << "' on architecture sm_" << arch diff --git a/tensorflow/compiler/mlir/tosa/tfl_passes.h b/tensorflow/compiler/mlir/tosa/tfl_passes.h index 96d3cabf0c1f1f..02bd007f6fa36c 100644 --- a/tensorflow/compiler/mlir/tosa/tfl_passes.h +++ b/tensorflow/compiler/mlir/tosa/tfl_passes.h @@ -42,8 +42,8 @@ struct TOSATFLLegalizationPipelineOptions llvm::cl::desc("Dequantize the TFLite softmax"), llvm::cl::init(false)}; TOSATFLLegalizationPipelineOptions() { - disabled_patterns = std::nullopt; - enabled_patterns = std::nullopt; + disabled_patterns = {}; + enabled_patterns = {}; } }; diff --git a/tensorflow/compiler/mlir/tosa/transforms/passes.h b/tensorflow/compiler/mlir/tosa/transforms/passes.h index de0872b660d4ec..0475d46a37a091 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/passes.h +++ b/tensorflow/compiler/mlir/tosa/transforms/passes.h @@ -53,8 +53,8 @@ std::unique_ptr> createFuseBiasTFPass(); // `enabledPatterns` is a set of labels used to filter out input patterns that // do not have one of the labels in this set. 
std::unique_ptr> createLegalizeTFLPass( - ArrayRef disabled_patterns = std::nullopt, - ArrayRef enabled_patterns = std::nullopt); + ArrayRef disabled_patterns = {}, + ArrayRef enabled_patterns = {}); std::unique_ptr> createRetainCallOnceFuncsPass(); std::unique_ptr> createStripModuleMetadataPass(); diff --git a/tensorflow/compiler/mlir/utils/name_utils.cc b/tensorflow/compiler/mlir/utils/name_utils.cc index fd50116ba7d1a7..fb5bb77644c211 100644 --- a/tensorflow/compiler/mlir/utils/name_utils.cc +++ b/tensorflow/compiler/mlir/utils/name_utils.cc @@ -31,8 +31,8 @@ namespace { // Checks if a character is legal for a TensorFlow node name, with special // handling if a character is at the beginning. bool IsLegalChar(char c, bool first_char) { - if (isalpha(c)) return true; - if (isdigit(c)) return true; + if (llvm::isAlpha(c)) return true; + if (llvm::isDigit(c)) return true; if (c == '.') return true; if (c == '_') return true; diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index f957ec4b08e322..995ae2b5740ae7 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -2,7 +2,14 @@ load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test") load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test", "tf_cuda_cc_test") load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") load("//tensorflow/compiler/tests:build_combined_defs.bzl", "tf_xla_combined_py_test") -load("//tensorflow/compiler/tests:build_defs.bzl", "generate_backend_suites", "tf_xla_py_strict_test") +load( + "//tensorflow/compiler/tests:build_defs.bzl", + "generate_backend_suites", + "tf_xla_py_strict_test", + # copybara:uncomment_begin(google-only) + # "tpu_backends", + # copybara:uncomment_end +) load( "//tensorflow/core/platform:build_config_root.bzl", "tf_cuda_tests_tags", @@ -93,6 +100,7 @@ py_strict_test( tf_xla_combined_py_test( name = "combined_ops_test_a", size = "medium", + timeout = "long", package = 
"tensorflow.compiler.tests", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -213,9 +221,8 @@ tf_xla_combined_py_test( name = "combined_ops_test_f", size = "medium", timeout = "long", - # copybara:uncomment_begin - # #TODO(b/286470564): Remove once the bug is fixed. - # disable_tpu_tfrt = True, + # copybara:uncomment_begin(google-only) + # disabled_backends = tpu_backends(), # copybara:uncomment_end exec_properties = { "cpp_link.mem": "16g", @@ -340,10 +347,6 @@ tf_xla_py_strict_test( name = "add_n_test", size = "small", srcs = ["add_n_test.py"], - # copybara:uncomment_begin - # #TODO(b/286470564): Remove once the bug is fixed. - # disable_tpu_tfrt = True, - # copybara:uncomment_end tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "notap", @@ -496,10 +499,6 @@ tf_xla_py_strict_test( name = "cond_test", size = "small", srcs = ["cond_test.py"], - # copybara:uncomment_begin - # #TODO(b/286470564): Remove once the bug is fixed. - # disable_tpu_tfrt = True, - # copybara:uncomment_end tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "notap", @@ -1746,12 +1745,8 @@ tf_xla_py_strict_test( name = "tensor_list_ops_test", size = "small", srcs = ["tensor_list_ops_test.py"], - # copybara:uncomment_begin - # #TODO(b/286470564): Remove once the bug is fixed. - # disable_tpu_tfrt = True, - # copybara:uncomment_end - # TensorList ops are not implemented in the on-demand compilation model yet. - disabled_backends = ["cpu_ondemand"], + # TensorList ops are only implemented on CPU. + enabled_backends = ["cpu"], tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip ], @@ -1906,10 +1901,6 @@ tf_xla_py_strict_test( name = "while_test", size = "small", srcs = ["while_test.py"], - # copybara:uncomment_begin - # #TODO(b/291130193): Remove once the bug is fixed. 
- # disable_tpu_tfrt = True, - # copybara:uncomment_end tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "notap", @@ -2082,7 +2073,7 @@ tf_xla_py_strict_test( tf_xla_py_strict_test( name = "xla_device_test", - size = "small", + size = "medium", srcs = ["xla_device_test.py"], tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -2166,7 +2157,6 @@ tf_xla_py_strict_test( "gpu_a100", "gpu_h100", ], - env = {"XLA_FLAGS": "--xla_backend_extra_options=xla_cpu_disable_new_fusion_emitters=true"}, tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip ], @@ -2430,9 +2420,6 @@ tf_xla_py_strict_test( name = "where_op_tpu_test", size = "small", srcs = ["where_op_test.py"], - args = [ - "--tpu_use_tfrt=true", - ], disabled_backends = [ "cpu", "cpu_ondemand", diff --git a/tensorflow/compiler/tests/cast_test.py b/tensorflow/compiler/tests/cast_test.py index bc35db4e05f7d5..453cbeb1312648 100644 --- a/tensorflow/compiler/tests/cast_test.py +++ b/tensorflow/compiler/tests/cast_test.py @@ -35,9 +35,10 @@ def test_cast(self): dtypes.uint32, dtypes.uint64, } - for src_type in types: - for dst_type in types: - self._test_cast(src_type, dst_type) + with self.session() as session: + for src_type in types: + for dst_type in types: + self._test_cast(src_type, dst_type, session) def test_cast_fp8(self): if platform.system() == "Darwin": @@ -61,12 +62,13 @@ def test_cast_fp8(self): dtypes.uint32, dtypes.uint64, } - for fp8_type in fp8_types: - for other_type in other_types | fp8_types: - self._test_cast(fp8_type, other_type) - self._test_cast(other_type, fp8_type) + with self.session() as session: + for fp8_type in fp8_types: + for other_type in other_types | fp8_types: + self._test_cast(fp8_type, other_type, session) + self._test_cast(other_type, fp8_type, session) - def _test_cast(self, src_type, dst_type): + def _test_cast(self, src_type, dst_type, session): with 
self.subTest(src_type=src_type, dst_type=dst_type): shapes = [[], [4], [2, 3], [2, 0, 4]] src_np_dtype = src_type.as_numpy_dtype @@ -83,6 +85,7 @@ def _test_cast(self, src_type, dst_type): lambda x, dst_type=dst_type: math_ops.cast(x, dst_type), src, expected=dst, + local_session=session, ) # Check special values. @@ -112,6 +115,7 @@ def _test_cast(self, src_type, dst_type): lambda x, dst_type=dst_type: math_ops.cast(x, dst_type), src, expected=dst, + local_session=session, ) def test_give_me_a_name(self): diff --git a/tensorflow/compiler/tests/float_ops_test.py b/tensorflow/compiler/tests/float_ops_test.py index d8743016c20756..67a1ecc967f24c 100644 --- a/tensorflow/compiler/tests/float_ops_test.py +++ b/tensorflow/compiler/tests/float_ops_test.py @@ -23,449 +23,522 @@ class FloatOpsTest(xla_test.XLATestCase): def test_float_ops(self): - for dtype in self.float_types: - x = np.arange(-0.90, 0.90, 0.25) - self.assert_op_output_matches_expected( - math_ops.acos, x.astype(dtype), expected=np.arccos(x).astype(dtype) - ) - self.assert_op_output_matches_expected( - math_ops.asin, x.astype(dtype), expected=np.arcsin(x).astype(dtype) - ) - x = np.arange(-3, 3).reshape(1, 3, 2) - self.assert_op_output_matches_expected( - math_ops.atan, x.astype(dtype), expected=np.arctan(x).astype(dtype) - ) - - self.assert_op_output_matches_expected( - math_ops.acosh, - np.array([1, 2, 3, 4], dtype=dtype), - expected=np.array( - [0, 1.3169579, 1.76274717, 2.06343707], dtype=dtype - ), - ) - - self.assert_op_output_matches_expected( - math_ops.asinh, - np.array([1, 2, 3, 4], dtype=dtype), - expected=np.array( - [0.88137359, 1.44363548, 1.81844646, 2.09471255], dtype=dtype - ), - ) - - self.assert_op_output_matches_expected( - math_ops.atanh, - np.array([0.1, 0.2, 0.3, 0.4], dtype=dtype), - expected=np.array( - [0.10033535, 0.20273255, 0.3095196, 0.42364893], dtype=dtype - ), - ) - - self.assert_op_output_matches_expected( - math_ops.ceil, - np.array([[-1.7, 1.2]], dtype=dtype), - 
expected=np.array([[-1, 2]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.cosh, - np.array([1, 2, 3, 4], dtype=dtype), - expected=np.array( - [1.54308063, 3.76219569, 10.067662, 27.30823284], dtype=dtype - ), - ) - - # Disable float16 testing for now - if dtype != np.float16: - x = np.arange(-10, 10, 1).astype(dtype) - with self.session() as session: + with self.session() as session: + for dtype in self.float_types: + x = np.arange(-0.90, 0.90, 0.25) + self.assert_op_output_matches_expected( + math_ops.acos, + x.astype(dtype), + expected=np.arccos(x).astype(dtype), + local_session=session, + ) + self.assert_op_output_matches_expected( + math_ops.asin, + x.astype(dtype), + expected=np.arcsin(x).astype(dtype), + local_session=session, + ) + x = np.arange(-3, 3).reshape(1, 3, 2) + self.assert_op_output_matches_expected( + math_ops.atan, + x.astype(dtype), + expected=np.arctan(x).astype(dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.acosh, + np.array([1, 2, 3, 4], dtype=dtype), + expected=np.array( + [0, 1.3169579, 1.76274717, 2.06343707], dtype=dtype + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.asinh, + np.array([1, 2, 3, 4], dtype=dtype), + expected=np.array( + [0.88137359, 1.44363548, 1.81844646, 2.09471255], dtype=dtype + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.atanh, + np.array([0.1, 0.2, 0.3, 0.4], dtype=dtype), + expected=np.array( + [0.10033535, 0.20273255, 0.3095196, 0.42364893], dtype=dtype + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.ceil, + np.array([[-1.7, 1.2]], dtype=dtype), + expected=np.array([[-1, 2]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.cosh, + np.array([1, 2, 3, 4], dtype=dtype), + expected=np.array( + [1.54308063, 3.76219569, 10.067662, 27.30823284], dtype=dtype + ), + 
local_session=session, + ) + + # Disable float16 testing for now + if dtype != np.float16: + x = np.arange(-10, 10, 1).astype(dtype) erf_x = session.run(math_ops.erf(x)) erfc_x = session.run(math_ops.erfc(x)) - self.assert_op_output_matches_expected(math_ops.erf, x, expected=erf_x) - self.assert_op_output_matches_expected( - math_ops.erfc, x, expected=erfc_x - ) - - self.assert_op_output_matches_expected( - math_ops.exp, - np.array([[-1, 1]], dtype=dtype), - expected=np.array([[0.36787945, 2.7182817]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.expm1, - np.array([[-1, 1]], dtype=dtype), - expected=np.array([[-0.63212056, 1.71828183]], dtype=dtype), - rtol=1e-5, - ) - - self.assert_op_output_matches_expected( - math_ops.floor, - np.array([[-1.7, 1.2]], dtype=dtype), - expected=np.array([[-2, 1]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.is_finite, - np.array( - [[-np.inf, -2, -1, 0, 0.5, 1, 2, np.inf, np.nan]], dtype=dtype - ), - expected=np.array([[0, 1, 1, 1, 1, 1, 1, 0, 0]], dtype=np.bool_), - ) - - # Tests for tf.nn ops. 
- self.assert_op_output_matches_expected( - nn_ops.l2_loss, np.array([[[]]], dtype=dtype), expected=dtype(0) - ) - - self.assert_op_output_matches_expected(nn_ops.l2_loss, dtype(4), dtype(8)) - - self.assert_op_output_matches_expected( - nn_ops.l2_loss, np.array([[-2, 4]], dtype=dtype), expected=dtype(10) - ) - - self.assert_op_output_matches_expected( - math_ops.reciprocal, - np.array([[1, 2]], dtype=dtype), - expected=np.array([[1, 0.5]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.log, - np.array([[1, 2]], dtype=dtype), - expected=np.array([[0, 0.69314718]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.sin, - np.array([[1, 2]], dtype=dtype), - expected=np.array([[0.841478, 0.909302]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.cos, - np.array([[1, 2]], dtype=dtype), - expected=np.array([[0.540297, -0.41614]], dtype=dtype), - ) - - # Confirm that log1p will remain precise across a range of small values. 
- self.assert_op_output_matches_expected( - math_ops.log1p, - np.array( - [[1e-14, 1e-15, 0.6, 2] + [x * 1e-5 for x in range(1, 20)]], - dtype=dtype, - ), - expected=np.log1p( - np.array( - [[1e-14, 1e-15, 0.6, 2] + [x * 1e-5 for x in range(1, 20)]], - dtype=dtype, - ) - ).astype(dtype), - rtol=1e-15 if dtype == np.float64 else 1e-4, - atol=1e-15 if dtype == np.float64 else 1e-4, - ) - - self.assert_op_output_matches_expected( - math_ops.rint, - np.array( - [ - [-1.7, 1.2, 4.0, 0.0], - [-3.5, -2.5, -1.5, -0.5], - [0.5, 1.5, 2.5, 3.5], - ], - dtype=dtype, - ), - expected=np.array( - [[-2, 1, 4, 0], [-4, -2, -2, 0], [0, 2, 2, 4]], dtype=dtype - ), - ) - self.assert_op_output_matches_expected( - math_ops.round, - np.array( - [ - [-1.7, 1.2, 4.0, 0.0], - [-3.5, -2.5, -1.5, -0.5], - [0.5, 1.5, 2.5, 3.5], - ], - dtype=dtype, - ), - expected=np.array( - [[-2, 1, 4, 0], [-4, -2, -2, 0], [0, 2, 2, 4]], dtype=dtype - ), - ) - - self.assert_op_output_matches_expected( - math_ops.rsqrt, - np.array([[4, 16]], dtype=dtype), - expected=np.array([[0.5, 0.25]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.sigmoid, - np.array([[1, 1, 1, 1], [1, 2, 3, 4]], dtype=dtype), - expected=np.array( - [ - [0.7310586, 0.7310586, 0.7310586, 0.7310586], - [0.7310586, 0.880797, 0.95257413, 0.98201376], - ], - dtype=dtype, - ), - ) - - self.assert_op_output_matches_expected( - math_ops.sigmoid, - np.array([-300, -150, 0, 150, 300], dtype=dtype), - expected=np.array([0, 0, 0.5, 1, 1], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.sinh, - np.array([1, 2, 3, 4], dtype=dtype), - expected=np.array( - [1.17520119, 3.62686041, 10.01787493, 27.2899172], dtype=dtype - ), - ) - - self.assert_op_output_matches_expected( - math_ops.sqrt, - np.array([[4, 9]], dtype=dtype), - expected=np.array([[2, 3]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.tan, - np.array([1, 2, 3, 4], dtype=dtype), - expected=np.array( - [1.55740772, 
-2.18503986, -0.14254654, 1.15782128], dtype=dtype - ), - ) - - self.assert_op_output_matches_expected( - math_ops.tanh, - np.array( - [[1, 2, 3, 4], [np.inf, -np.inf, np.nan, 20], [19, -19, 22, -22]], - dtype=dtype, - ), - expected=np.array( - [ - [0.76159418, 0.96402758, 0.99505478, 0.99932933], - [1.0, -1.0, np.nan, 1.0], - [1.0, -1.0, 1.0, -1.0], - ], - dtype=dtype, - ), - ) - - self.assert_op_output_matches_expected( - nn_ops.log_softmax, - np.array([[1, 1, 1, 1], [1, 2, 3, 4]], dtype=dtype), - expected=np.array( - [ - [-1.3862944, -1.3862944, -1.3862944, -1.3862944], - [-3.4401896, -2.4401896, -1.4401897, -0.44018969], - ], - dtype=dtype, - ), - ) - - self.assert_op_output_matches_expected( - nn_ops.elu, - np.array([[-1, 0, 1, -1e-6]], dtype=dtype), - expected=np.array([[-0.63212056, 0, 1, -9.999995e-07]], dtype=dtype), - rtol=1e-5, - atol=1e-6, - ) - - self.assert_op_output_matches_expected( - nn_ops.selu, - np.array([[-1, 0, 1, -1e-5]], dtype=dtype), - expected=np.array( - [[-1.11133074, 0.0, 1.05070099, -1.758090550379974e-05]], - dtype=dtype, - ), - rtol=1e-5, - atol=1e-6, - ) - - self.assert_op_output_matches_expected( - nn_ops.relu, - np.array([[-1, 1]], dtype=dtype), - expected=np.array([[0, 1]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - nn_ops.relu6, - np.array([[-0.05, 6.05, 5]], dtype=dtype), - expected=np.array([[0, 6, 5]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - nn_ops.leaky_relu, - np.array([[-2, -1, 0, 1, 2]], dtype=dtype), - expected=np.array([[-0.4, -0.2, 0.0, 1.0, 2.0]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - nn_ops.softmax, - np.array([1, 2, 3, 4], dtype=dtype), - expected=np.array( - [0.032058604, 0.087144323, 0.23688284, 0.64391428], dtype=dtype - ), - ) - - self.assert_op_output_matches_expected( - nn_ops.softmax, - np.array([[1, 1, 1, 1], [1, 2, 3, 4]], dtype=dtype), - expected=np.array( - [ - [0.25, 0.25, 0.25, 0.25], - [0.032058604, 0.087144323, 0.23688284, 
0.64391428], - ], - dtype=dtype, - ), - ) - - self.assert_op_output_matches_expected( - nn_ops.softmax, - np.array([[[1, 1], [1, 1]], [[1, 2], [3, 4]]], dtype=dtype), - expected=np.array( - [ - [[0.5, 0.5], [0.5, 0.5]], - [[0.26894142, 0.73105858], [0.26894142, 0.73105858]], - ], - dtype=dtype, - ), - ) - - self.assert_op_output_matches_expected( - nn_ops.softsign, - np.array([[-2, -1, 0, 1, 2]], dtype=dtype), - expected=np.array( - [[-0.66666669, -0.5, 0, 0.5, 0.66666669]], dtype=dtype - ), - ) - - self.assert_op_output_matches_expected( - math_ops.sign, - np.array( - [[-2.0, -1.0, -0.0, +0.0, 1.0, 2.0, float("nan")]], dtype=dtype - ), - expected=np.array( - [[-1.0, -1.0, -0.0, +0.0, 1.0, 1.0, float("nan")]], dtype=dtype - ), - ) - - self.assert_op_output_matches_expected( - math_ops.is_finite, - np.array( - [[42, float("inf"), -123], [float("nan"), 0, -0.0]], dtype=dtype - ), - expected=np.array( - [[True, False, True], [False, True, True]], dtype=np.bool_ - ), - ) - - self.assert_op_output_matches_expected( - math_ops.lgamma, - np.array(0.5, dtype=dtype), - expected=np.array(np.log(np.pi) / 2, dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.lgamma, - np.array( - [ - [1, 2, 3], - [4, 5, 6], - [1 / 2, 3 / 2, 5 / 2], - [-3 / 2, -7 / 2, -11 / 2], - ], - dtype=dtype, - ), - expected=np.array( - [ - [0, 0, np.log(2.0)], - [np.log(6.0), np.log(24.0), np.log(120)], - [ - np.log(np.pi) / 2, - np.log(np.pi) / 2 - np.log(2), - np.log(np.pi) / 2 - np.log(4) + np.log(3), - ], - [ - np.log(np.pi) / 2 - np.log(3) + np.log(4), - np.log(np.pi) / 2 - np.log(105) + np.log(16), - np.log(np.pi) / 2 - np.log(10395) + np.log(64), - ], - ], - dtype=dtype, - ), - ) - - # The actual result is complex. Take the real part. 
- self.assert_op_output_matches_expected( - math_ops.lgamma, - np.array([-1 / 2, -5 / 2, -9 / 2], dtype=dtype), - expected=np.array( - [ - np.log(np.pi) / 2 + np.log(2), - np.log(np.pi) / 2 - np.log(15) + np.log(8), - np.log(np.pi) / 2 - np.log(945) + np.log(32), - ], - dtype=dtype, - ), - atol=1e-4, - ) - - self.assert_op_output_matches_expected( - math_ops.digamma, - np.array( - [ - [1.0, 0.5, 1 / 3.0], - [0.25, 1 / 6.0, 0.125], - [2.0, 3.0, 4.0], - [6.0, 8.0, 9.0], - ], - dtype=dtype, - ), - expected=np.array( - [ - [ - -np.euler_gamma, - -2 * np.log(2) - np.euler_gamma, - -np.pi / 2 / np.sqrt(3) - - 3 * np.log(3) / 2 - - np.euler_gamma, - ], - [ - -np.pi / 2 - 3 * np.log(2) - np.euler_gamma, - -np.pi * np.sqrt(3) / 2 - - 2 * np.log(2) - - 3 * np.log(3) / 2 - - np.euler_gamma, - -np.pi / 2 - - 4 * np.log(2) - - ( - np.pi - + np.log(2 + np.sqrt(2)) - - np.log(2 - np.sqrt(2)) - ) - / np.sqrt(2) - - np.euler_gamma, - ], - [ - 1 - np.euler_gamma, - 1.5 - np.euler_gamma, - 11 / 6.0 - np.euler_gamma, - ], - [ - 137 / 60.0 - np.euler_gamma, - 363 / 140.0 - np.euler_gamma, - 761 / 280.0 - np.euler_gamma, - ], - ], - dtype=dtype, - ), - ) + self.assert_op_output_matches_expected( + math_ops.erf, + x, + expected=erf_x, + local_session=session, + ) + self.assert_op_output_matches_expected( + math_ops.erfc, + x, + expected=erfc_x, + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.exp, + np.array([[-1, 1]], dtype=dtype), + expected=np.array([[0.36787945, 2.7182817]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.expm1, + np.array([[-1, 1]], dtype=dtype), + expected=np.array([[-0.63212056, 1.71828183]], dtype=dtype), + local_session=session, + rtol=1e-5, + ) + + self.assert_op_output_matches_expected( + math_ops.floor, + np.array([[-1.7, 1.2]], dtype=dtype), + expected=np.array([[-2, 1]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + 
math_ops.is_finite, + np.array( + [[-np.inf, -2, -1, 0, 0.5, 1, 2, np.inf, np.nan]], dtype=dtype + ), + expected=np.array([[0, 1, 1, 1, 1, 1, 1, 0, 0]], dtype=np.bool_), + local_session=session, + ) + + # Tests for tf.nn ops. + self.assert_op_output_matches_expected( + nn_ops.l2_loss, + np.array([[[]]], dtype=dtype), + expected=dtype(0), + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.l2_loss, + dtype(4), + dtype(8), + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.l2_loss, + np.array([[-2, 4]], dtype=dtype), + expected=dtype(10), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.reciprocal, + np.array([[1, 2]], dtype=dtype), + expected=np.array([[1, 0.5]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.log, + np.array([[1, 2]], dtype=dtype), + expected=np.array([[0, 0.69314718]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.sin, + np.array([[1, 2]], dtype=dtype), + expected=np.array([[0.841478, 0.909302]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.cos, + np.array([[1, 2]], dtype=dtype), + expected=np.array([[0.540297, -0.41614]], dtype=dtype), + local_session=session, + ) + + # Confirm that log1p will remain precise across a range of small values. 
+ self.assert_op_output_matches_expected( + math_ops.log1p, + np.array( + [[1e-14, 1e-15, 0.6, 2] + [x * 1e-5 for x in range(1, 20)]], + dtype=dtype, + ), + expected=np.log1p( + np.array( + [[1e-14, 1e-15, 0.6, 2] + [x * 1e-5 for x in range(1, 20)]], + dtype=dtype, + ) + ).astype(dtype), + local_session=session, + rtol=1e-15 if dtype == np.float64 else 1e-4, + atol=1e-15 if dtype == np.float64 else 1e-4, + ) + + self.assert_op_output_matches_expected( + math_ops.rint, + np.array( + [ + [-1.7, 1.2, 4.0, 0.0], + [-3.5, -2.5, -1.5, -0.5], + [0.5, 1.5, 2.5, 3.5], + ], + dtype=dtype, + ), + expected=np.array( + [[-2, 1, 4, 0], [-4, -2, -2, 0], [0, 2, 2, 4]], dtype=dtype + ), + local_session=session, + ) + self.assert_op_output_matches_expected( + math_ops.round, + np.array( + [ + [-1.7, 1.2, 4.0, 0.0], + [-3.5, -2.5, -1.5, -0.5], + [0.5, 1.5, 2.5, 3.5], + ], + dtype=dtype, + ), + expected=np.array( + [[-2, 1, 4, 0], [-4, -2, -2, 0], [0, 2, 2, 4]], dtype=dtype + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.rsqrt, + np.array([[4, 16]], dtype=dtype), + expected=np.array([[0.5, 0.25]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.sigmoid, + np.array([[1, 1, 1, 1], [1, 2, 3, 4]], dtype=dtype), + expected=np.array( + [ + [0.7310586, 0.7310586, 0.7310586, 0.7310586], + [0.7310586, 0.880797, 0.95257413, 0.98201376], + ], + dtype=dtype, + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.sigmoid, + np.array([-300, -150, 0, 150, 300], dtype=dtype), + expected=np.array([0, 0, 0.5, 1, 1], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.sinh, + np.array([1, 2, 3, 4], dtype=dtype), + expected=np.array( + [1.17520119, 3.62686041, 10.01787493, 27.2899172], dtype=dtype + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.sqrt, + np.array([[4, 9]], dtype=dtype), + 
expected=np.array([[2, 3]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.tan, + np.array([1, 2, 3, 4], dtype=dtype), + expected=np.array( + [1.55740772, -2.18503986, -0.14254654, 1.15782128], dtype=dtype + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.tanh, + np.array( + [ + [1, 2, 3, 4], + [np.inf, -np.inf, np.nan, 20], + [19, -19, 22, -22], + ], + dtype=dtype, + ), + expected=np.array( + [ + [0.76159418, 0.96402758, 0.99505478, 0.99932933], + [1.0, -1.0, np.nan, 1.0], + [1.0, -1.0, 1.0, -1.0], + ], + dtype=dtype, + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.log_softmax, + np.array([[1, 1, 1, 1], [1, 2, 3, 4]], dtype=dtype), + expected=np.array( + [ + [-1.3862944, -1.3862944, -1.3862944, -1.3862944], + [-3.4401896, -2.4401896, -1.4401897, -0.44018969], + ], + dtype=dtype, + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.elu, + np.array([[-1, 0, 1, -1e-6]], dtype=dtype), + expected=np.array( + [[-0.63212056, 0, 1, -9.999995e-07]], dtype=dtype + ), + rtol=1e-5, + atol=1e-6, + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.selu, + np.array([[-1, 0, 1, -1e-5]], dtype=dtype), + expected=np.array( + [[-1.11133074, 0.0, 1.05070099, -1.758090550379974e-05]], + dtype=dtype, + ), + rtol=1e-5, + atol=1e-6, + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.relu, + np.array([[-1, 1]], dtype=dtype), + expected=np.array([[0, 1]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.relu6, + np.array([[-0.05, 6.05, 5]], dtype=dtype), + expected=np.array([[0, 6, 5]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.leaky_relu, + np.array([[-2, -1, 0, 1, 2]], dtype=dtype), + expected=np.array([[-0.4, -0.2, 0.0, 1.0, 2.0]], dtype=dtype), + 
local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.softmax, + np.array([1, 2, 3, 4], dtype=dtype), + expected=np.array( + [0.032058604, 0.087144323, 0.23688284, 0.64391428], dtype=dtype + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.softmax, + np.array([[1, 1, 1, 1], [1, 2, 3, 4]], dtype=dtype), + expected=np.array( + [ + [0.25, 0.25, 0.25, 0.25], + [0.032058604, 0.087144323, 0.23688284, 0.64391428], + ], + dtype=dtype, + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.softmax, + np.array([[[1, 1], [1, 1]], [[1, 2], [3, 4]]], dtype=dtype), + expected=np.array( + [ + [[0.5, 0.5], [0.5, 0.5]], + [[0.26894142, 0.73105858], [0.26894142, 0.73105858]], + ], + dtype=dtype, + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.softsign, + np.array([[-2, -1, 0, 1, 2]], dtype=dtype), + expected=np.array( + [[-0.66666669, -0.5, 0, 0.5, 0.66666669]], dtype=dtype + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.sign, + np.array( + [[-2.0, -1.0, -0.0, +0.0, 1.0, 2.0, float("nan")]], dtype=dtype + ), + expected=np.array( + [[-1.0, -1.0, -0.0, +0.0, 1.0, 1.0, float("nan")]], dtype=dtype + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.is_finite, + np.array( + [[42, float("inf"), -123], [float("nan"), 0, -0.0]], dtype=dtype + ), + expected=np.array( + [[True, False, True], [False, True, True]], dtype=np.bool_ + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.lgamma, + np.array(0.5, dtype=dtype), + expected=np.array(np.log(np.pi) / 2, dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.lgamma, + np.array( + [ + [1, 2, 3], + [4, 5, 6], + [1 / 2, 3 / 2, 5 / 2], + [-3 / 2, -7 / 2, -11 / 2], + ], + dtype=dtype, + ), + expected=np.array( + [ + [0, 0, np.log(2.0)], + [np.log(6.0), 
np.log(24.0), np.log(120)], + [ + np.log(np.pi) / 2, + np.log(np.pi) / 2 - np.log(2), + np.log(np.pi) / 2 - np.log(4) + np.log(3), + ], + [ + np.log(np.pi) / 2 - np.log(3) + np.log(4), + np.log(np.pi) / 2 - np.log(105) + np.log(16), + np.log(np.pi) / 2 - np.log(10395) + np.log(64), + ], + ], + dtype=dtype, + ), + local_session=session, + ) + + # The actual result is complex. Take the real part. + self.assert_op_output_matches_expected( + math_ops.lgamma, + np.array([-1 / 2, -5 / 2, -9 / 2], dtype=dtype), + expected=np.array( + [ + np.log(np.pi) / 2 + np.log(2), + np.log(np.pi) / 2 - np.log(15) + np.log(8), + np.log(np.pi) / 2 - np.log(945) + np.log(32), + ], + dtype=dtype, + ), + local_session=session, + atol=1e-4, + ) + + self.assert_op_output_matches_expected( + math_ops.digamma, + np.array( + [ + [1.0, 0.5, 1 / 3.0], + [0.25, 1 / 6.0, 0.125], + [2.0, 3.0, 4.0], + [6.0, 8.0, 9.0], + ], + dtype=dtype, + ), + expected=np.array( + [ + [ + -np.euler_gamma, + -2 * np.log(2) - np.euler_gamma, + -np.pi / 2 / np.sqrt(3) + - 3 * np.log(3) / 2 + - np.euler_gamma, + ], + [ + -np.pi / 2 - 3 * np.log(2) - np.euler_gamma, + -np.pi * np.sqrt(3) / 2 + - 2 * np.log(2) + - 3 * np.log(3) / 2 + - np.euler_gamma, + -np.pi / 2 + - 4 * np.log(2) + - ( + np.pi + + np.log(2 + np.sqrt(2)) + - np.log(2 - np.sqrt(2)) + ) + / np.sqrt(2) + - np.euler_gamma, + ], + [ + 1 - np.euler_gamma, + 1.5 - np.euler_gamma, + 11 / 6.0 - np.euler_gamma, + ], + [ + 137 / 60.0 - np.euler_gamma, + 363 / 140.0 - np.euler_gamma, + 761 / 280.0 - np.euler_gamma, + ], + ], + dtype=dtype, + ), + local_session=session, + ) if __name__ == "__main__": diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc index 3061b37aaa354c..88b379331b32ef 100644 --- a/tensorflow/compiler/tests/randomized_tests.cc +++ b/tensorflow/compiler/tests/randomized_tests.cc @@ -110,12 +110,12 @@ namespace { int64_t tf_xla_random_seed = 0; int32_t tf_xla_test_repetitions = 20; int64_t 
tf_xla_max_tensor_size = 10000LL; -string* tf_xla_test_device_ptr; // initial value set in main() -string* tf_xla_reference_device_ptr; // initial value set in main() +std::string* tf_xla_test_device_ptr; // initial value set in main() +std::string* tf_xla_reference_device_ptr; // initial value set in main() bool tf_xla_test_use_jit = true; bool tf_xla_test_use_mlir = false; -string LocalDeviceToFullDeviceName(const string& device) { +std::string LocalDeviceToFullDeviceName(const std::string& device) { return absl::StrCat("/job:localhost/replica:0/task:0/device:", device); } @@ -129,7 +129,7 @@ constexpr std::array kAllNumberTypes = { // operator. class OpTestBuilder { public: - explicit OpTestBuilder(const string& op_name); + explicit OpTestBuilder(const std::string& op_name); // Adds an input 'tensor' as a Placeholder node. OpTestBuilder& Input(const Tensor& tensor); @@ -161,10 +161,11 @@ class OpTestBuilder { // sets it to the NodeDef of the operator under test. Fills 'inputs' and // 'outputs' with the names of the input placeholder nodes and the output // identity nodes, respectively. 
- absl::Status BuildGraph(const string& name_prefix, const string& device, - bool use_jit, GraphDef* graphdef, - NodeDef** test_node_def, std::vector* inputs, - std::vector* outputs) const; + absl::Status BuildGraph(const std::string& name_prefix, + const std::string& device, bool use_jit, + GraphDef* graphdef, NodeDef** test_node_def, + std::vector* inputs, + std::vector* outputs) const; struct InputDescription { Tensor tensor; @@ -182,7 +183,7 @@ class OpTestBuilder { std::vector inputs_; }; -OpTestBuilder::OpTestBuilder(const string& op_name) { +OpTestBuilder::OpTestBuilder(const std::string& op_name) { node_def_.set_op(op_name); } @@ -247,12 +248,10 @@ OpTestBuilder& OpTestBuilder::Attr(absl::string_view attr_name, return *this; } -absl::Status OpTestBuilder::BuildGraph(const string& name_prefix, - const string& device, bool use_jit, - GraphDef* graphdef, - NodeDef** test_node_def, - std::vector* inputs, - std::vector* outputs) const { +absl::Status OpTestBuilder::BuildGraph( + const std::string& name_prefix, const std::string& device, bool use_jit, + GraphDef* graphdef, NodeDef** test_node_def, + std::vector* inputs, std::vector* outputs) const { OpRegistryInterface* op_registry = OpRegistry::Global(); const OpDef* op_def; @@ -275,7 +274,7 @@ absl::Status OpTestBuilder::BuildGraph(const string& name_prefix, // Build feed and fetch nodes. 
for (int i = 0; i < input_types.size(); ++i) { NodeDef* def = graphdef->add_node(); - string name = absl::StrCat(name_prefix, "_input_", i); + std::string name = absl::StrCat(name_prefix, "_input_", i); TF_RETURN_IF_ERROR(NodeDefBuilder(name, "Placeholder") .Device(device) .Attr("dtype", input_types[i]) @@ -286,7 +285,7 @@ absl::Status OpTestBuilder::BuildGraph(const string& name_prefix, for (int i = 0; i < output_types.size(); ++i) { NodeDef* def = graphdef->add_node(); - string name = absl::StrCat(name_prefix, "_output_", i); + std::string name = absl::StrCat(name_prefix, "_output_", i); TF_RETURN_IF_ERROR(NodeDefBuilder(name, "Identity") .Device(device) .Attr("T", output_types[i]) @@ -494,7 +493,7 @@ class OpTest : public ::testing::Test { const std::vector& spatial_dims); // Converts an int64 vector to an int32 vector. - std::vector AsInt32s(const std::vector& int64s); + std::vector AsInt32s(const std::vector& int64s); std::mt19937& generator() { return *generator_; } @@ -664,16 +663,16 @@ class TensorGeneratorComplex64 : public TensorGenerator { } }; -class TensorGeneratorInt32 : public TensorGenerator { +class TensorGeneratorInt32 : public TensorGenerator { public: explicit TensorGeneratorInt32(OpTest& test) : TensorGenerator(test) {} DataType dtype() override { return DT_INT32; } - void RandomVals(std::optional lo, std::optional hi, + void RandomVals(std::optional lo, std::optional hi, bool needs_unique_values, - absl::FixedArray& vals) override { - absl::flat_hash_set already_generated; - std::uniform_int_distribution distribution(lo.value_or(-(1 << 20)), - hi.value_or(1 << 20)); + absl::FixedArray& vals) override { + absl::flat_hash_set already_generated; + std::uniform_int_distribution distribution(lo.value_or(-(1 << 20)), + hi.value_or(1 << 20)); for (int64_t i = 0; i < vals.size(); ++i) { int32_t generated; do { @@ -685,13 +684,13 @@ class TensorGeneratorInt32 : public TensorGenerator { } }; -class TensorGeneratorInt64 : public TensorGenerator { +class 
TensorGeneratorInt64 : public TensorGenerator { public: explicit TensorGeneratorInt64(OpTest& test) : TensorGenerator(test) {} DataType dtype() override { return DT_INT64; } - void RandomVals(std::optional lo, std::optional hi, + void RandomVals(std::optional lo, std::optional hi, bool needs_unique_values, - absl::FixedArray& vals) override { + absl::FixedArray& vals) override { absl::flat_hash_set already_generated; std::uniform_int_distribution distribution( lo.value_or(-(1LL << 40)), hi.value_or(1LL << 40)); @@ -928,18 +927,19 @@ Tensor OpTest::RandomBoundedTensor(DataType dtype, Tensor lo, Tensor hi) { break; } case DT_INT32: { - auto lo_flat = lo.flat(); - auto hi_flat = hi.flat(); - test::FillFn(&tensor, [this, &lo_flat, &hi_flat](int i) -> int32 { - std::uniform_int_distribution distribution(lo_flat(i), - hi_flat(i)); - return distribution(generator()); - }); + auto lo_flat = lo.flat(); + auto hi_flat = hi.flat(); + test::FillFn( + &tensor, [this, &lo_flat, &hi_flat](int i) -> int32_t { + std::uniform_int_distribution distribution(lo_flat(i), + hi_flat(i)); + return distribution(generator()); + }); break; } case DT_INT64: { - auto lo_flat = lo.flat(); - auto hi_flat = hi.flat(); + auto lo_flat = lo.flat(); + auto hi_flat = hi.flat(); test::FillFn( &tensor, [this, &lo_flat, &hi_flat](int i) -> int64_t { std::uniform_int_distribution distribution(lo_flat(i), @@ -1021,21 +1021,21 @@ OpTest::BroadcastableDims() { Tensor OpTest::RandomReductionIndices(int rank) { std::bernoulli_distribution random_bool; - std::vector indices; + std::vector indices; for (int i = 0; i < rank; ++i) { if (random_bool(generator())) { indices.push_back(i); } } - return test::AsTensor(indices); + return test::AsTensor(indices); } // Helper that converts 'values' to an int32 or int64 Tensor. 
static Tensor AsIntTensor(DataType dtype, const std::vector& values) { switch (dtype) { case DT_INT32: { - std::vector values32(values.begin(), values.end()); - return test::AsTensor(values32); + std::vector values32(values.begin(), values.end()); + return test::AsTensor(values32); } case DT_INT64: return test::AsTensor(values); @@ -1092,9 +1092,9 @@ OpTest::ConcatArguments OpTest::ChooseConcatArguments(bool int64_idx_allowed) { std::vector dims = RandomDims(1, 4, 0, 64); int axis = - std::uniform_int_distribution(0, dims.size() - 1)(generator()); - a.axis = - use_int64_idx ? test::AsScalar(axis) : test::AsScalar(axis); + std::uniform_int_distribution(0, dims.size() - 1)(generator()); + a.axis = use_int64_idx ? test::AsScalar(axis) + : test::AsScalar(axis); for (int i = 0; i < a.n; ++i) { std::vector shape = dims; @@ -1113,7 +1113,7 @@ OpTest::EinsumArguments OpTest::ChooseEinsumArguments() { switch (op_kind) { case matmul: case batchmatmul: { - std::vector dims; + std::vector dims; if (op_kind == matmul) { a.equation = "ij,jk->ik"; dims = RandomDims(2, 2); @@ -1131,7 +1131,7 @@ OpTest::EinsumArguments OpTest::ChooseEinsumArguments() { } case dot: { a.equation = "i,i->"; - std::vector dims = RandomDims(1, 1); + std::vector dims = RandomDims(1, 1); a.lhs_dims = dims; a.rhs_dims = dims; break; @@ -1166,11 +1166,11 @@ OpTest::GatherArguments OpTest::ChooseGatherArguments(bool axis_0) { a.batch_dims, kDefaultMaxRank - 1); axis = axis_distribution(generator()); } - a.axis = test::AsScalar((int32)axis); + a.axis = test::AsScalar((int32_t)axis); a.params_shape = RandomDims(axis + 1, kDefaultMaxRank, 1, 16); std::vector indices_shape = RandomDims(0, 3, 0, 16); - a.indices = RandomBoundedTensor(DT_INT32, 0, a.params_shape[axis] - 1, - false, indices_shape); + a.indices = RandomBoundedTensor( + DT_INT32, 0, a.params_shape[axis] - 1, false, indices_shape); return a; } @@ -1209,7 +1209,7 @@ OpTest::ScatterArguments OpTest::ChooseScatterArguments() { a.indices_type = DT_INT32; 
a.shape = RandomDims(1, kDefaultMaxRank, 1); int rank = a.shape.size(); - std::uniform_int_distribution index_len_dist(1, rank); + std::uniform_int_distribution index_len_dist(1, rank); int index_len = index_len_dist(generator()); std::vector indices_first = RandomDims(1, kDefaultMaxRank - 1, 1); std::vector indices_shape(indices_first); @@ -1219,9 +1219,9 @@ OpTest::ScatterArguments OpTest::ChooseScatterArguments() { updates_shape.push_back(a.shape[index_len + i]); } Tensor indices_lo(a.indices_type, TensorShape(indices_shape)); - test::FillFn(&indices_lo, [](int i) -> int32 { return 0; }); + test::FillFn(&indices_lo, [](int i) -> int32_t { return 0; }); Tensor indices_hi(a.indices_type, TensorShape(indices_shape)); - test::FillFn(&indices_hi, [index_len, &a](int i) -> int32 { + test::FillFn(&indices_hi, [index_len, &a](int i) -> int32_t { int idx_dim = i % index_len; return a.shape[idx_dim] - 1; }); @@ -1239,16 +1239,16 @@ OpTest::SliceArguments OpTest::ChooseSliceArguments(bool neg_one_size) { a.shape = RandomDims(); int rank = a.shape.size(); - std::vector indices(rank); + std::vector indices(rank); a.size.resize(rank); for (int i = 0; i < rank; ++i) { indices[i] = - std::uniform_int_distribution(0, a.shape[i])(generator()); + std::uniform_int_distribution(0, a.shape[i])(generator()); int64_t low = neg_one_size ? -1 : 0; a.size[i] = std::uniform_int_distribution( low, a.shape[i] - indices[i])(generator()); } - a.indices = test::AsTensor(indices); + a.indices = test::AsTensor(indices); return a; } @@ -1341,8 +1341,8 @@ std::vector OpTest::ImageDims( return dims; } -std::vector OpTest::AsInt32s(const std::vector& int64s) { - return std::vector(int64s.begin(), int64s.end()); +std::vector OpTest::AsInt32s(const std::vector& int64s) { + return std::vector(int64s.begin(), int64s.end()); } // Functions for comparing tensors. 
@@ -1382,11 +1382,11 @@ bool IsClose(const complex64& x, const complex64& y, double atol, } template -string Str(T x) { +std::string Str(T x) { return absl::StrCat(x); } template <> -string Str(complex64 x) { +std::string Str(complex64 x) { return absl::StrCat("(", x.real(), ", ", x.imag(), ")"); } @@ -1460,7 +1460,7 @@ absl::Status TensorsAreClose(const Tensor& a, const Tensor& b, double atol, case DT_COMPLEX64: return TensorsAreCloseImpl(a, b, atol, rtol); case DT_INT32: - return TensorsAreEqualImpl(a, b); + return TensorsAreEqualImpl(a, b); case DT_INT64: return TensorsAreEqualImpl(a, b); case DT_BOOL: @@ -1499,9 +1499,10 @@ OpTest::TestResult OpTest::ExpectTfAndXlaOutputsAreClose( VLOG(1) << "Input: " << input_tensors.back().DebugString(); } - string reference_device = + std::string reference_device = LocalDeviceToFullDeviceName(*tf_xla_reference_device_ptr); - string test_device = LocalDeviceToFullDeviceName(*tf_xla_test_device_ptr); + std::string test_device = + LocalDeviceToFullDeviceName(*tf_xla_test_device_ptr); DeviceNameUtils::ParsedName parsed_name; if (!DeviceNameUtils::ParseLocalName(*tf_xla_test_device_ptr, &parsed_name)) { @@ -1512,8 +1513,8 @@ OpTest::TestResult OpTest::ExpectTfAndXlaOutputsAreClose( ++num_tests_; GraphDef graph; - std::vector expected_inputs, test_inputs; - std::vector expected_fetches, test_fetches; + std::vector expected_inputs, test_inputs; + std::vector expected_fetches, test_fetches; absl::Status status = builder.BuildGraph( absl::StrCat("test", num_tests_, "_expected"), reference_device, /*use_jit=*/false, &graph, /*test_node_def=*/nullptr, &expected_inputs, @@ -1550,8 +1551,9 @@ OpTest::TestResult OpTest::ExpectTfAndXlaOutputsAreClose( return kFatalError; } - std::vector> expected_feeds(expected_inputs.size()); - std::vector> test_feeds(test_inputs.size()); + std::vector> expected_feeds( + expected_inputs.size()); + std::vector> test_feeds(test_inputs.size()); CHECK_EQ(input_tensors.size(), expected_inputs.size()); 
CHECK_EQ(input_tensors.size(), test_inputs.size()); @@ -1707,12 +1709,12 @@ TEST_F(OpTest, ArgMax) { auto type = Choose({DT_BOOL, DT_FLOAT}); std::vector dims = RandomDims(1, 5, 1); int num_dims = dims.size(); - int reduce_dim = - std::uniform_int_distribution(-num_dims, num_dims)(generator()); + int reduce_dim = std::uniform_int_distribution( + -num_dims, num_dims)(generator()); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("ArgMax") .RandomInput(type, dims) - .Input(test::AsScalar(reduce_dim)) + .Input(test::AsScalar(reduce_dim)) .Attr("T", type) .Attr("Tidx", DT_INT32) .Attr("output_type", DT_INT32)); @@ -1724,12 +1726,12 @@ TEST_F(OpTest, ArgMin) { auto type = Choose({DT_BOOL, DT_FLOAT}); std::vector dims = RandomDims(1, 5, 1); int num_dims = dims.size(); - int reduce_dim = - std::uniform_int_distribution(-num_dims, num_dims)(generator()); + int reduce_dim = std::uniform_int_distribution( + -num_dims, num_dims)(generator()); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("ArgMin") .RandomInput(type, dims) - .Input(test::AsScalar(reduce_dim)) + .Input(test::AsScalar(reduce_dim)) .Attr("T", type) .Attr("Tidx", DT_INT32) .Attr("output_type", DT_INT32)); @@ -1786,7 +1788,7 @@ TEST_F(OpTest, AvgPool) { std::uniform_int_distribution(1, dims[2])(generator()); int stride_rows = random_int(generator()), stride_cols = random_int(generator()); - string padding = Choose({"SAME", "VALID"}); + std::string padding = Choose({"SAME", "VALID"}); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("AvgPool") .RandomInput(DT_FLOAT, dims) @@ -1817,7 +1819,7 @@ TEST_F(OpTest, AvgPool3D) { int64_t batch = dims[3]; int64_t feature = dims[4]; - string padding = Choose({"SAME", "VALID"}); + std::string padding = Choose({"SAME", "VALID"}); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("AvgPool3D") .RandomInput(DT_FLOAT, @@ -1837,13 +1839,13 @@ TEST_F(OpTest, AvgPoolGrad) { Repeatedly([this]() { int batch = RandomDim(1), features = RandomDim(1); WindowedSpatialDims d = 
ChooseWindowedSpatialDims(2); - std::vector input_dims = + std::vector input_dims = AsInt32s(ImageDims(FORMAT_NHWC, batch, features, d.input_dims)); std::vector output_dims = ImageDims(FORMAT_NHWC, batch, features, d.output_dims); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("AvgPoolGrad") - .Input(test::AsTensor(input_dims)) + .Input(test::AsTensor(input_dims)) .RandomInput(DT_FLOAT, output_dims) .Attr("T", DT_FLOAT) .Attr("ksize", ImageDims(FORMAT_NHWC, 1, 1, d.kernel_dims)) @@ -1859,13 +1861,13 @@ TEST_F(OpTest, AvgPool3DGrad) { Repeatedly([this]() { int batch = RandomDim(1), features = RandomDim(1); WindowedSpatialDims d = ChooseWindowedSpatialDims(3); - std::vector input_dims = + std::vector input_dims = AsInt32s(ImageDims(FORMAT_NHWC, batch, features, d.input_dims)); std::vector output_dims = ImageDims(FORMAT_NHWC, batch, features, d.output_dims); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("AvgPool3DGrad") - .Input(test::AsTensor(input_dims)) + .Input(test::AsTensor(input_dims)) .RandomInput(DT_FLOAT, output_dims) .Attr("T", DT_FLOAT) .Attr("ksize", ImageDims(FORMAT_NHWC, 1, 1, d.kernel_dims)) @@ -1976,8 +1978,8 @@ TEST_F(OpTest, BatchToSpaceND) { return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("BatchToSpaceND") .RandomInput(type, input_dims) - .Input(test::AsTensor( - std::vector(block_dims.begin(), block_dims.end()))) + .Input(test::AsTensor( + std::vector(block_dims.begin(), block_dims.end()))) .Input(crops) .Attr("T", type)); }); @@ -2198,15 +2200,15 @@ TEST_F(OpTest, ConcatOffset) { std::vector dims = RandomDims(1); int concat_dim = - std::uniform_int_distribution(0, dims.size() - 1)(generator()); + std::uniform_int_distribution(0, dims.size() - 1)(generator()); OpTestBuilder builder("ConcatOffset"); - builder.Input(test::AsScalar(concat_dim)); + builder.Input(test::AsScalar(concat_dim)); builder.Attr("N", n); for (int i = 0; i < n; ++i) { - std::vector shape(dims.begin(), dims.end()); + std::vector shape(dims.begin(), dims.end()); 
shape[concat_dim] = RandomDim(); - builder.Input(test::AsTensor(shape)); + builder.Input(test::AsTensor(shape)); } return ExpectTfAndXlaOutputsAreClose(builder); }); @@ -2280,7 +2282,8 @@ TEST_F(OpTest, IFFT3D) { TEST_F(OpTest, RFFT) { Repeatedly([this]() { std::vector dims = RandomDims(1, kDefaultMaxRank, 3); - Tensor fft_shape = test::AsTensor(AsInt32s({dims[dims.size() - 1]})); + Tensor fft_shape = + test::AsTensor(AsInt32s({dims[dims.size() - 1]})); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("RFFT").RandomInput(DT_FLOAT, dims).Input(fft_shape)); }); @@ -2289,7 +2292,7 @@ TEST_F(OpTest, RFFT) { TEST_F(OpTest, RFFT2D) { Repeatedly([this]() { std::vector dims = RandomDims(2, kDefaultMaxRank, 3); - Tensor fft_shape = test::AsTensor( + Tensor fft_shape = test::AsTensor( AsInt32s({dims[dims.size() - 2], dims[dims.size() - 1]})); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("RFFT2D").RandomInput(DT_FLOAT, dims).Input(fft_shape)); @@ -2299,7 +2302,7 @@ TEST_F(OpTest, RFFT2D) { TEST_F(OpTest, RFFT3D) { Repeatedly([this]() { std::vector dims = RandomDims(3, kDefaultMaxRank, 3); - Tensor fft_shape = test::AsTensor(AsInt32s( + Tensor fft_shape = test::AsTensor(AsInt32s( {dims[dims.size() - 3], dims[dims.size() - 2], dims[dims.size() - 1]})); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("RFFT3D").RandomInput(DT_FLOAT, dims).Input(fft_shape)); @@ -2311,7 +2314,7 @@ TEST_F(OpTest, IRFFT) { std::vector dims = RandomDims(1, kDefaultMaxRank, 3); int64_t orig_size = dims[dims.size() - 1]; dims[dims.size() - 1] = dims[dims.size() - 1] / 2 + 1; - Tensor fft_shape = test::AsTensor(AsInt32s({orig_size})); + Tensor fft_shape = test::AsTensor(AsInt32s({orig_size})); return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("IRFFT") .RandomInput(DT_COMPLEX64, dims) .Input(fft_shape)); @@ -2324,7 +2327,7 @@ TEST_F(OpTest, IRFFT2D) { std::vector orig_size = {dims[dims.size() - 2], dims[dims.size() - 1]}; dims[dims.size() - 1] = dims[dims.size() - 1] / 2 + 1; - Tensor 
fft_shape = test::AsTensor(AsInt32s({orig_size})); + Tensor fft_shape = test::AsTensor(AsInt32s({orig_size})); return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("IRFFT2D") .RandomInput(DT_COMPLEX64, dims) .Input(fft_shape)); @@ -2337,7 +2340,7 @@ TEST_F(OpTest, IRFFT3D) { std::vector orig_size = { dims[dims.size() - 3], dims[dims.size() - 2], dims[dims.size() - 1]}; dims[dims.size() - 1] = dims[dims.size() - 1] / 2 + 1; - Tensor fft_shape = test::AsTensor(AsInt32s({orig_size})); + Tensor fft_shape = test::AsTensor(AsInt32s({orig_size})); return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("IRFFT3D") .RandomInput(DT_COMPLEX64, dims) .Input(fft_shape)); @@ -2383,7 +2386,7 @@ TEST_F(OpTest, Conv2DBackpropFilter) { ImageDims(FORMAT_NHWC, batch, features_in, d.input_dims); std::vector backprop = ImageDims(FORMAT_NHWC, batch, features_out, d.output_dims); - Tensor kernel_shape = test::AsTensor(AsInt32s( + Tensor kernel_shape = test::AsTensor(AsInt32s( {d.kernel_dims[0], d.kernel_dims[1], features_in, features_out})); DataType type = DT_FLOAT; return ExpectTfAndXlaOutputsAreClose( @@ -2405,7 +2408,7 @@ TEST_F(OpTest, Conv2DBackpropInput) { int features_in = random_int(generator()); int features_out = random_int(generator()); int32_t batch = RandomDim(); - Tensor in_shape = test::AsTensor( + Tensor in_shape = test::AsTensor( AsInt32s(ImageDims(FORMAT_NHWC, batch, features_in, d.input_dims))); std::vector backprop = ImageDims(FORMAT_NHWC, batch, features_out, d.output_dims); @@ -2461,7 +2464,7 @@ TEST_F(OpTest, Conv3DBackpropFilter) { ImageDims(FORMAT_NHWC, batch, features_in, d.input_dims); std::vector backprop = ImageDims(FORMAT_NHWC, batch, features_out, d.output_dims); - Tensor kernel_shape = test::AsTensor( + Tensor kernel_shape = test::AsTensor( AsInt32s({d.kernel_dims[0], d.kernel_dims[1], d.kernel_dims[2], features_in, features_out})); DataType type = DT_FLOAT; @@ -2485,7 +2488,7 @@ TEST_F(OpTest, Conv3DBackpropInput) { int features_in = random_int(generator()); int 
features_out = random_int(generator()); int32_t batch = RandomDim(1); - Tensor in_shape = test::AsTensor( + Tensor in_shape = test::AsTensor( AsInt32s(ImageDims(FORMAT_NHWC, batch, features_in, d.input_dims))); std::vector backprop = ImageDims(FORMAT_NHWC, batch, features_out, d.output_dims); @@ -2583,7 +2586,7 @@ TEST_F(OpTest, DepthwiseConv2DNativeBackpropFilter) { ImageDims(FORMAT_NHWC, batch, features_in, d.input_dims); std::vector backprop = ImageDims( FORMAT_NHWC, batch, features_in * depth_multiplier, d.output_dims); - Tensor kernel_shape = test::AsTensor(AsInt32s( + Tensor kernel_shape = test::AsTensor(AsInt32s( {d.kernel_dims[0], d.kernel_dims[1], features_in, depth_multiplier})); std::vector strides = ImageDims(FORMAT_NHWC, 1, 1, d.stride_dims); strides[2] = strides[1]; // Current impl only supports equal strides @@ -2608,7 +2611,7 @@ TEST_F(OpTest, DepthwiseConv2DBackpropInput) { int features_in = random_int(generator()); int depth_multiplier = random_int(generator()); int32_t batch = RandomDim(); - Tensor in_shape = test::AsTensor( + Tensor in_shape = test::AsTensor( AsInt32s(ImageDims(FORMAT_NHWC, batch, features_in, d.input_dims))); std::vector backprop = ImageDims( FORMAT_NHWC, batch, features_in * depth_multiplier, d.output_dims); @@ -2713,15 +2716,15 @@ TEST_F(OpTest, DynamicStitch) { // implementation does so require. However, the native TF implementation // leaves undefined values if we don't cover everything, so we can't // really test that case anyway. 
- std::vector indices(size); + std::vector indices(size); std::iota(indices.begin(), indices.end(), 0); std::shuffle(indices.begin(), indices.end(), generator()); int pos = 0; for (int i = 0; i < n; ++i) { TensorShape shape(index_dims[i]); - Tensor t = test::AsTensor( - absl::Span(indices).subspan(pos, shape.num_elements()), + Tensor t = test::AsTensor( + absl::Span(indices).subspan(pos, shape.num_elements()), shape); builder.Input(t); pos += t.NumElements(); @@ -2781,8 +2784,8 @@ TEST_F(OpTest, EluGrad) { TEST_F(OpTest, ScatterNd) { Repeatedly([this]() { auto a = ChooseScatterArguments(); - auto shape = test::AsTensor( - std::vector(a.shape.begin(), a.shape.end())); + auto shape = test::AsTensor( + std::vector(a.shape.begin(), a.shape.end())); return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("ScatterNd") .Input(a.indices) .Input(a.updates) @@ -2855,8 +2858,9 @@ TEST_F(OpTest, ExpandDims) { auto type = Choose(kAllXlaTypes); std::vector in_dims = RandomDims(); Tensor dim(DT_INT32, TensorShape()); - std::uniform_int_distribution d(-1 - in_dims.size(), in_dims.size()); - dim.scalar()() = d(generator()); + std::uniform_int_distribution d(-1 - in_dims.size(), + in_dims.size()); + dim.scalar()() = d(generator()); return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("ExpandDims") .RandomInput(type, in_dims) .Input(dim) @@ -2868,10 +2872,10 @@ TEST_F(OpTest, Fill) { Repeatedly([this]() { auto type = Choose(kAllXlaTypes); std::vector dims = RandomDims(); - std::vector shape(dims.begin(), dims.end()); + std::vector shape(dims.begin(), dims.end()); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("Fill") - .Input(test::AsTensor(shape)) + .Input(test::AsTensor(shape)) .RandomInput(type, {}) .Attr("T", type)); }); @@ -2949,9 +2953,9 @@ TEST_F(OpTest, GatherNd) { std::vector output_shape(output_outer_shape); output_shape.push_back(index_len); Tensor lo(indices_type, TensorShape(output_shape)); - test::FillFn(&lo, [](int i) -> int32 { return 0; }); + test::FillFn(&lo, [](int 
i) -> int32_t { return 0; }); Tensor hi(indices_type, TensorShape(output_shape)); - test::FillFn(&hi, [index_len, ¶ms_shape](int i) -> int32 { + test::FillFn(&hi, [index_len, ¶ms_shape](int i) -> int32_t { int idx_dim = i % index_len; return params_shape[idx_dim] - 1; }); @@ -3016,7 +3020,7 @@ TEST_F(OpTest, InplaceUpdate) { x_dims.insert(x_dims.end(), common_dims.begin(), common_dims.end()); std::vector i_shape{v_dims[0]}; Tensor i = - RandomBoundedTensor(DT_INT32, 0, x_dims[0] - 1, true, i_shape); + RandomBoundedTensor(DT_INT32, 0, x_dims[0] - 1, true, i_shape); return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("InplaceUpdate") .RandomInput(type, x_dims) .Input(i) @@ -3046,7 +3050,7 @@ TEST_F(OpTest, InvertPermutation) { // TODO(b/211012712): Once needs_unique_values case is linear instead of // quadratic time, use default Dim max instead of 8. int64_t len = RandomDim(0, 8); - Tensor x = RandomBoundedTensor(DT_INT32, 0, len - 1, true, {len}); + Tensor x = RandomBoundedTensor(DT_INT32, 0, len - 1, true, {len}); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("InvertPermutation").Input(x).Attr("T", DT_INT32)); }); @@ -3151,7 +3155,7 @@ TEST_F(OpTest, Lgamma) { TEST_F(OpTest, LinSpace) { Repeatedly([this]() { auto ToScalar = [](DataType type, int x) { - if (type == DT_INT32) return test::AsScalar(x); + if (type == DT_INT32) return test::AsScalar(x); return test::AsScalar(x); }; std::uniform_int_distribution distribution(-50, 50); @@ -3290,11 +3294,11 @@ TEST_F(OpTest, MatrixBandPart) { auto type = Choose(kAllXlaTypes); auto index_type = Choose({DT_INT32, DT_INT64}); auto num_lower = - RandomBoundedTensor(index_type, -2 * kDefaultMaxDimensionSize, - 2 * kDefaultMaxDimensionSize, false, {}); + RandomBoundedTensor(index_type, -2 * kDefaultMaxDimensionSize, + 2 * kDefaultMaxDimensionSize, false, {}); auto num_upper = - RandomBoundedTensor(index_type, -2 * kDefaultMaxDimensionSize, - 2 * kDefaultMaxDimensionSize, false, {}); + RandomBoundedTensor(index_type, -2 * 
kDefaultMaxDimensionSize, + 2 * kDefaultMaxDimensionSize, false, {}); return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("MatrixBandPart") .RandomInput(type) .Input(num_lower) @@ -3330,12 +3334,12 @@ TEST_F(OpTest, MatrixDiagPartV3) { auto type = Choose(kAllXlaTypes); auto align = Choose( {"LEFT_RIGHT", "RIGHT_LEFT", "LEFT_LEFT", "RIGHT_RIGHT"}); - auto k0 = std::uniform_int_distribution( + auto k0 = std::uniform_int_distribution( -2 * kDefaultMaxDimensionSize, 2 * kDefaultMaxDimensionSize)(generator()); - auto k1 = std::uniform_int_distribution( + auto k1 = std::uniform_int_distribution( k0, 2 * kDefaultMaxDimensionSize)(generator()); - auto k = test::AsTensor({k0, k1}); + auto k = test::AsTensor({k0, k1}); return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("MatrixDiagPartV3") .RandomInput(type) .Input(k) @@ -3369,10 +3373,10 @@ TEST_F(OpTest, MatrixSetDiagV2) { int64_t max_num_diags = shape[rank - 2] + shape[rank - 1] - 1; int64_t num_diags = std::uniform_int_distribution(2, max_num_diags)(generator()); - int32 k0 = std::uniform_int_distribution( + int32_t k0 = std::uniform_int_distribution( -shape[rank - 2] + 1, shape[rank - 1] - num_diags)(generator()); - int32 k1 = k0 + num_diags - 1; - Tensor k = test::AsTensor({k0, k1}); + int32_t k1 = k0 + num_diags - 1; + Tensor k = test::AsTensor({k0, k1}); int64_t max_diag_len = std::min(shape[rank - 2] + std::min(k1, 0), shape[rank - 1] + std::min(-k0, 0)); std::vector diagonal_shape(shape); @@ -3424,7 +3428,7 @@ TEST_F(OpTest, MaxPool) { int stride_rows = random_int(generator()), stride_cols = random_int(generator()); - string padding = Choose({"SAME", "VALID"}); + std::string padding = Choose({"SAME", "VALID"}); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("MaxPool") .RandomInput(DT_FLOAT, dims) @@ -3458,7 +3462,7 @@ TEST_F(OpTest, MaxPool3D) { int64_t batch = dims[3]; int64_t feature = dims[4]; - string padding = Choose({"SAME", "VALID"}); + std::string padding = Choose({"SAME", "VALID"}); return 
ExpectTfAndXlaOutputsAreClose( OpTestBuilder("MaxPool3D") .RandomInput(DT_FLOAT, @@ -3585,20 +3589,20 @@ TEST_F(OpTest, OneHot) { int32_t depth = RandomDim(); Tensor indices(DT_INT32, TensorShape(dims)); - std::uniform_int_distribution distribution(-depth * 2, depth * 2); - test::FillFn(&indices, [this, &distribution](int i) -> int32 { + std::uniform_int_distribution distribution(-depth * 2, depth * 2); + test::FillFn(&indices, [this, &distribution](int i) -> int32_t { return distribution(generator()); }); - int axis = std::uniform_int_distribution(-num_dims - 5, - num_dims + 5)(generator()); + int axis = std::uniform_int_distribution( + -num_dims - 5, num_dims + 5)(generator()); OpTestBuilder builder("OneHot"); builder.Attr("T", type); builder.Attr("TI", DT_INT32); builder.Attr("axis", axis); builder.Input(indices); - builder.Input(test::AsScalar(depth)); + builder.Input(test::AsScalar(depth)); builder.RandomInput(type, {}); builder.RandomInput(type, {}); return ExpectTfAndXlaOutputsAreClose(builder); @@ -3621,8 +3625,8 @@ TEST_F(OpTest, Pack) { std::vector dims = RandomDims(); int num_dims = dims.size(); - int axis = std::uniform_int_distribution(-num_dims - 1, - num_dims)(generator()); + int axis = std::uniform_int_distribution(-num_dims - 1, + num_dims)(generator()); OpTestBuilder builder("Pack"); builder.Attr("T", type); @@ -3764,7 +3768,7 @@ TEST_F(OpTest, RandomUniform) { TEST_F(OpTest, Range) { Repeatedly([this]() { auto ToScalar = [](DataType type, int x) { - if (type == DT_INT32) return test::AsScalar(x); + if (type == DT_INT32) return test::AsScalar(x); if (type == DT_INT64) return test::AsScalar(x); if (type == DT_FLOAT) return test::AsScalar(x); if (type == DT_DOUBLE) return test::AsScalar(x); @@ -3881,8 +3885,8 @@ TEST_F(OpTest, Reshape) { return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("Reshape") .RandomInput(type, dims_before) - .Input(test::AsTensor( - std::vector(dims_after.begin(), dims_after.end()))) + .Input(test::AsTensor( + 
std::vector(dims_after.begin(), dims_after.end()))) .Attr("T", type)); }); } @@ -3908,8 +3912,8 @@ TEST_F(OpTest, ResizeBilinear) { return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("ResizeBilinear") .RandomInput(DT_FLOAT, in_dims) - .Input(test::AsTensor( - std::vector(out_dims.begin(), out_dims.end()))) + .Input(test::AsTensor( + std::vector(out_dims.begin(), out_dims.end()))) .Attr("T", DT_FLOAT) .Attr("align_corners", true)); }); @@ -3961,14 +3965,14 @@ TEST_F(OpTest, ReverseSequence) { int batch_size = dims[batch_dim]; int max_seq_len = dims[seq_dim]; - std::vector seq_lens(batch_size); - std::uniform_int_distribution d(0, max_seq_len); + std::vector seq_lens(batch_size); + std::uniform_int_distribution d(0, max_seq_len); absl::c_generate(seq_lens, [&]() { return d(generator()); }); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("ReverseSequence") .RandomInput(type, dims) - .Input(test::AsTensor(seq_lens)) + .Input(test::AsTensor(seq_lens)) .Attr("seq_dim", seq_dim) .Attr("batch_dim", batch_dim) .Attr("T", type) @@ -4157,14 +4161,15 @@ TEST_F(OpTest, Size) { TEST_F(OpTest, Slice) { Repeatedly([this]() { SliceArguments a = ChooseSliceArguments(true); - std::vector size; + std::vector size; size.insert(size.end(), a.size.begin(), a.size.end()); - return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Slice") - .RandomInput(a.type, a.shape) - .Input(a.indices) - .Input(test::AsTensor(size)) - .Attr("T", a.type) - .Attr("Index", a.indices_type)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Slice") + .RandomInput(a.type, a.shape) + .Input(a.indices) + .Input(test::AsTensor(size)) + .Attr("T", a.type) + .Attr("Index", a.indices_type)); }); } @@ -4298,8 +4303,8 @@ TEST_F(OpTest, SpaceToBatchND) { return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("SpaceToBatchND") .RandomInput(type, input_dims) - .Input(test::AsTensor( - std::vector(block_dims.begin(), block_dims.end()))) + .Input(test::AsTensor( + std::vector(block_dims.begin(), block_dims.end()))) 
.Input(paddings) .Attr("T", type)); }); @@ -4356,16 +4361,16 @@ TEST_F(OpTest, SparseSoftmaxCrossEntropyWithLogits) { int64_t batch_size = dims[0]; int64_t num_classes = dims[1]; - std::vector indices(batch_size); + std::vector indices(batch_size); for (int64_t i = 0; i < batch_size; ++i) { - indices[i] = - std::uniform_int_distribution(0, num_classes - 1)(generator()); + indices[i] = std::uniform_int_distribution( + 0, num_classes - 1)(generator()); } return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("SparseSoftmaxCrossEntropyWithLogits") .RandomInput(DT_FLOAT, dims) - .Input(test::AsTensor(indices)) + .Input(test::AsTensor(indices)) .Attr("T", DT_FLOAT) .Attr("Tlabels", DT_INT32)); }); @@ -4379,18 +4384,19 @@ TEST_F(OpTest, Split) { auto type = Choose(kAllXlaTypes); std::vector dims = RandomDims(1); std::uniform_int_distribution ud; - int32_t dim = std::uniform_int_distribution( - -static_cast(dims.size()), - static_cast(dims.size()) - 1)(generator()); + int32_t dim = std::uniform_int_distribution( + -static_cast(dims.size()), + static_cast(dims.size()) - 1)(generator()); int n = std::uniform_int_distribution(1, 5)(generator()); // Ensure 'dim' is evenly divisible by 'n'. 
dims[dim] /= n; dims[dim] *= n; - return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Split") - .Input(test::AsScalar(dim)) - .RandomInput(type, dims) - .Attr("T", type) - .Attr("num_split", n)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Split") + .Input(test::AsScalar(dim)) + .RandomInput(type, dims) + .Attr("T", type) + .Attr("num_split", n)); }); } @@ -4401,12 +4407,12 @@ TEST_F(OpTest, SplitV) { Repeatedly([this]() { // NOLINT: due to GTEST_SKIP auto type = Choose(kAllXlaTypes); std::vector dims = RandomDims(1, kDefaultMaxRank, 1); - int32_t dim = std::uniform_int_distribution( - -static_cast(dims.size()), - static_cast(dims.size()) - 1)(generator()); + int32_t dim = std::uniform_int_distribution( + -static_cast(dims.size()), + static_cast(dims.size()) - 1)(generator()); int n = std::uniform_int_distribution( 1, std::min(5, static_cast(dims[dim])))(generator()); - std::vector size_splits(n); + std::vector size_splits(n); for (int i = 0; i < n - 1; ++i) { size_splits.push_back(dims[dim] / n); } @@ -4414,8 +4420,8 @@ TEST_F(OpTest, SplitV) { return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("SplitV") .RandomInput(type, dims) - .Input(test::AsTensor(size_splits)) - .Input(test::AsScalar(dim)) + .Input(test::AsTensor(size_splits)) + .Input(test::AsScalar(dim)) .Attr("T", type) .Attr("num_split", n) .Attr("Tlen", DT_INT32)); @@ -4515,12 +4521,12 @@ TEST_F(OpTest, StridedSlice) { Repeatedly([this]() { auto type = Choose(kAllXlaTypes); std::vector data_dims = RandomDims(); - std::vector begin(data_dims.size()), end(data_dims.size()); - std::vector strides(data_dims.size()); + std::vector begin(data_dims.size()), end(data_dims.size()); + std::vector strides(data_dims.size()); for (int i = 0; i < data_dims.size(); ++i) { - begin[i] = std::uniform_int_distribution( + begin[i] = std::uniform_int_distribution( -2 * data_dims[i], 2 * data_dims[i])(generator()); - end[i] = std::uniform_int_distribution( + end[i] = std::uniform_int_distribution( -2 * 
data_dims[i], 2 * data_dims[i])(generator()); // TODO(b/31360685): support strides other than 1 or -1 strides[i] = std::bernoulli_distribution()(generator()) ? 1 : -1; @@ -4543,9 +4549,9 @@ TEST_F(OpTest, StridedSlice) { return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("StridedSlice") .RandomInput(type, data_dims) - .Input(test::AsTensor(begin)) - .Input(test::AsTensor(end)) - .Input(test::AsTensor(strides)) + .Input(test::AsTensor(begin)) + .Input(test::AsTensor(end)) + .Input(test::AsTensor(strides)) .Attr("T", type) .Attr("Index", DT_INT32) .Attr("begin_mask", begin_mask) @@ -4656,14 +4662,14 @@ TEST_F(OpTest, Tile) { Repeatedly([this]() { auto type = Choose(kAllXlaTypes); std::vector t_dims = RandomDims(1); - std::vector multiples(t_dims.size()); + std::vector multiples(t_dims.size()); for (int i = 0; i < t_dims.size(); ++i) { multiples[i] = std::uniform_int_distribution(1, 3)(generator()); } return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("Tile") .RandomInput(type, t_dims) - .Input(test::AsTensor(multiples)) + .Input(test::AsTensor(multiples)) .Attr("T", type)); }); } @@ -4674,10 +4680,11 @@ TEST_F(OpTest, TopKV2) { Repeatedly([this]() { // NOLINT: due to GTEST_SKIP auto type = Choose({DT_INT32, DT_FLOAT, DT_INT64}); auto shape = RandomDims(1); - int32 k = std::uniform_int_distribution(1, shape[0])(generator()); + int32_t k = + std::uniform_int_distribution(1, shape[0])(generator()); return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("TopKV2") .RandomInput(type, shape) - .Input(test::AsScalar(k)) + .Input(test::AsScalar(k)) .Attr("sorted", RandomBool()) .Attr("T", type)); }); @@ -4687,13 +4694,14 @@ TEST_F(OpTest, Transpose) { Repeatedly([this]() { auto type = Choose(kAllXlaTypes); std::vector data_dims = RandomDims(); - std::vector perm(data_dims.size()); + std::vector perm(data_dims.size()); std::iota(perm.begin(), perm.end(), 0); std::shuffle(perm.begin(), perm.end(), generator()); - return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Transpose") - 
.RandomInput(type, data_dims) - .Input(test::AsTensor(perm)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Transpose") + .RandomInput(type, data_dims) + .Input(test::AsTensor(perm)) + .Attr("T", type)); }); } @@ -4883,8 +4891,8 @@ TEST_F(OpTest, FusedBatchNormTraining) { } // namespace tensorflow int main(int argc, char** argv) { - tensorflow::tf_xla_test_device_ptr = new tensorflow::string("GPU:0"); - tensorflow::tf_xla_reference_device_ptr = new tensorflow::string("CPU:0"); + tensorflow::tf_xla_test_device_ptr = new std::string("GPU:0"); + tensorflow::tf_xla_reference_device_ptr = new std::string("CPU:0"); std::vector flag_list = { tensorflow::Flag( "tf_xla_random_seed", &tensorflow::tf_xla_random_seed, @@ -4909,7 +4917,7 @@ int main(int argc, char** argv) { "tf_xla_test_use_mlir", &tensorflow::tf_xla_test_use_mlir, "Use MLIR legalization kernels for the operator under test"), }; - tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list); + std::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); if (!parse_result) { LOG(ERROR) << "\n" << usage; diff --git a/tensorflow/compiler/tests/unary_ops_composition_test.cc b/tensorflow/compiler/tests/unary_ops_composition_test.cc index 641af606bb24d1..c27b8070bbb450 100644 --- a/tensorflow/compiler/tests/unary_ops_composition_test.cc +++ b/tensorflow/compiler/tests/unary_ops_composition_test.cc @@ -48,9 +48,9 @@ static bool Initialized = [] { class UnaryOpsCompositionTest : public OpsTestBase { protected: template - void RunComposedOp(const std::vector op_names, T input_scalar_value, - T expected_scalar_value) { - string xla_device_name = + void RunComposedOp(const std::vector op_names, + T input_scalar_value, T expected_scalar_value) { + std::string xla_device_name = tensorflow::IsGoogleCudaEnabled() ? 
DEVICE_XLA_GPU : DEVICE_XLA_CPU; SetDevice(DeviceType(xla_device_name), std::unique_ptr(DeviceFactory::NewDevice( diff --git a/tensorflow/compiler/tests/xla_device_test.py b/tensorflow/compiler/tests/xla_device_test.py index 864b64c349e798..7f84349bffd15b 100644 --- a/tensorflow/compiler/tests/xla_device_test.py +++ b/tensorflow/compiler/tests/xla_device_test.py @@ -31,9 +31,9 @@ def testCopies(self): """Tests that copies onto and off XLA devices work.""" shapes = [[0], [1], [1, 0], [1024, 0], [1024, 1], [3, 777], [777, 3], [16384, 1], [1, 16384], [1, 20000, 1, 1]] - for dtype in self.numeric_types: - for shape in shapes: - with self.session() as sess: + with self.session() as sess: + for dtype in self.numeric_types: + for shape in shapes: with ops.device("CPU"): x = array_ops.placeholder(dtype, shape) with self.test_scope(): @@ -53,8 +53,8 @@ def testCopiesOfUnsupportedTypesFailGracefully(self): dtypes.bfloat16.as_numpy_dtype ]) shape = (10, 10) - for unsupported_dtype in test_types - self.all_types: - with self.session() as sess: + with self.session() as sess: + for unsupported_dtype in test_types - self.all_types: with ops.device("CPU"): x = array_ops.placeholder(unsupported_dtype, shape) with self.test_scope(): diff --git a/tensorflow/compiler/tests/xla_test.py b/tensorflow/compiler/tests/xla_test.py index 20f93d86adfad1..d642418a44c2f5 100644 --- a/tensorflow/compiler/tests/xla_test.py +++ b/tensorflow/compiler/tests/xla_test.py @@ -308,7 +308,8 @@ def device_scope(self): yield def assert_op_output_matches_expected( - self, op, inp, expected, equality_test=None, rtol=1e-3, atol=1e-5 + self, op, inp, expected, local_session, + equality_test=None, rtol=1e-3, atol=1e-5 ): """Verifies that 'op' produces 'expected' when fed input 'inp' . @@ -316,25 +317,25 @@ def assert_op_output_matches_expected( op: operator to test inp: numpy input array to use as input to 'op'. expected: numpy array representing the expected output of 'op'. 
+ local_session: The session to use for the test. equality_test: either None, or a function that tests two numpy arrays for equality. If None, self.assertAllClose is used. rtol: relative tolerance for equality test. atol: absolute tolerance for equality test. """ - with self.session() as local_session: - with self.test_scope(): - pinp = array_ops.placeholder( - dtypes.as_dtype(inp.dtype), inp.shape, name='a' - ) - output = op(pinp) - result = local_session.run(output, {pinp: inp}) - if equality_test is None: - self.assertEqual(output.dtype, expected.dtype) - self.assertAllCloseAccordingToType( - expected, result, rtol=rtol, atol=atol, bfloat16_rtol=0.03 - ) - else: - equality_test(result, expected, rtol=rtol, atol=atol) + with self.test_scope(): + pinp = array_ops.placeholder( + dtypes.as_dtype(inp.dtype), inp.shape, name='a' + ) + output = op(pinp) + result = local_session.run(output, {pinp: inp}) + if equality_test is None: + self.assertEqual(output.dtype, expected.dtype) + self.assertAllCloseAccordingToType( + expected, result, rtol=rtol, atol=atol, bfloat16_rtol=0.03 + ) + else: + equality_test(result, expected, rtol=rtol, atol=atol) def test_scope(self): """Deprecated alias of `device_scope`. diff --git a/tensorflow/compiler/tf2tensorrt/common/datavec.h b/tensorflow/compiler/tf2tensorrt/common/datavec.h index eff32f1f521af4..34b419d1d20d62 100644 --- a/tensorflow/compiler/tf2tensorrt/common/datavec.h +++ b/tensorflow/compiler/tf2tensorrt/common/datavec.h @@ -27,7 +27,7 @@ namespace tensorrt { // Input/output data format for OpConverterTest::BuildAndRun(). 
struct InputOutputData { size_t TotalBytes() const { return tensor.TotalBytes(); } - string name; + std::string name; Tensor tensor; }; diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/einsum.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/einsum.cc index c8eb3db2e0b9e4..b4c3052953c677 100755 --- a/tensorflow/compiler/tf2tensorrt/convert/ops/einsum.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/einsum.cc @@ -739,16 +739,16 @@ class ReIndexer { // Initializes the index map with existing lowercase labels. ReIndexer(std::string eq) { for (char c : eq) { - if (islower(c)) { + if (absl::ascii_islower(c)) { idx_map_[c] = c; } } } // Finds new character for uppercase character c. char operator()(char c) { - if (!std::isupper(c)) return c; + if (!absl::ascii_isupper(c)) return c; if (idx_map_.count(c) > 0) return idx_map_[c]; - char new_idx = std::tolower(c); + char new_idx = absl::ascii_tolower(c); // If lower(c) is not used in the equation, use it to replace c. if (idx_map_.count(new_idx) == 0) { diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.cc index faedcf3de8c427..000c32df25d253 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.cc @@ -81,9 +81,7 @@ string ProfileStrategyToName(const ProfileStrategy strategy) { } Status ProfileStrategyFromName(const string& name, ProfileStrategy* strategy) { - string name_lowercase(name); - std::transform(name.begin(), name.end(), name_lowercase.begin(), - [](unsigned char c) { return std::tolower(c); }); + std::string name_lowercase = absl::AsciiStrToLower(name); if (name_lowercase == "range") { *strategy = ProfileStrategy::kRange; } else if (name_lowercase == "optimal") { diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc index 30aff91a76d3b1..d1bf00a53d1cc3 100644 --- 
a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc @@ -99,7 +99,7 @@ string TRTEngineCacheResource::DebugString() const { EngineContext* TRTEngineCacheResource::GetEngineContext( const std::vector& input_shapes) { EngineContext* engine_context = nullptr; - int64 min_matched_batch_size = kint64max; + int64 min_matched_batch_size = std::numeric_limits::max(); for (const auto& pair : cache_) { const std::vector& cached_input_shapes = pair.first; // This should not happen, but just for safety. diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 416f1a37179736..e5545445817ec2 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -138,6 +138,25 @@ cc_library( ], ) +cc_library( + name = "encoded_buffer_allocation_info", + hdrs = ["encoded_buffer_allocation_info.h"], + visibility = [":friends"], + deps = [ + "@local_xla//xla/backends/cpu:buffer_allocation_info", + ], +) + +tf_cc_test( + name = "encoded_buffer_allocation_info_test", + srcs = ["encoded_buffer_allocation_info_test.cc"], + deps = [ + ":encoded_buffer_allocation_info", + "@com_google_googletest//:gtest_main", + "@local_xla//xla/backends/cpu:buffer_allocation_info", + ], +) + cc_library( name = "tf2xla", srcs = ["tf2xla.cc"], @@ -218,6 +237,7 @@ filegroup( name = "xla_compiled_cpu_runtime_hdrs", srcs = [ "allocator.h", + "encoded_buffer_allocation_info.h", "xla_compiled_cpu_function.h", "//tensorflow/core/kernels:xla_cpu_runtime_hdrs", "//tensorflow/core/platform:xla_cpu_runtime_srcs", @@ -355,6 +375,7 @@ cc_library( # "@local_tsl//tsl/platform:context", # "@local_tsl//tsl/platform:cord", # "@local_tsl//tsl/platform:env_time", +# "@local_tsl//tsl/platform:refcount", # "@local_tsl//tsl/platform:ml_dtypes", # "@local_tsl//tsl/platform:logging", # "@local_tsl//tsl/platform:macros", @@ -406,8 +427,22 @@ cc_library( visibility = ["//visibility:public"], deps = [ 
"@com_google_absl//absl/base:dynamic_annotations", - "@local_xla//xla:cpu_function_runtime", + "@com_google_absl//absl/types:span", + "@local_xla//xla/backends/cpu:alignment", + "@local_xla//xla/backends/cpu:buffer_allocation_info", + ], +) + +tf_cc_test( + name = "allocator_test", + srcs = ["allocator_test.cc"], + deps = [ + ":allocator", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core:test_main", "@local_xla//xla/backends/cpu:alignment", + "@local_xla//xla/backends/cpu:buffer_allocation_info", ], ) @@ -418,14 +453,16 @@ cc_library( compatible_with = get_compatible_with_portable(), visibility = ["//visibility:public"], deps = [ + # Keep dependencies to a minimum here; this library is used in every AOT + # binary produced by tfcompile. ":allocator", "@com_google_absl//absl/log:check", + "@com_google_absl//absl/types:span", + ":encoded_buffer_allocation_info", "@local_xla//xla/service:custom_call_status_internal", - # Keep dependencies to a minimum here; this library is used in every AOT - # binary produced by tfcompile. 
"@local_xla//xla/backends/cpu/runtime:rng_state_lib", "@local_xla//xla/backends/cpu:alignment", - "@local_xla//xla:cpu_function_runtime", + "@local_xla//xla/backends/cpu:buffer_allocation_info", "@local_xla//xla:executable_run_options", "//tensorflow/core/platform:types", "@com_google_absl//absl/container:flat_hash_map", @@ -481,25 +518,13 @@ cc_library( alwayslink = 1, ) -tf_cc_test( - name = "cpu_function_runtime_test", - srcs = ["cpu_function_runtime_test.cc"], - deps = [ - ":allocator", - "//tensorflow/core:framework", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "@local_xla//xla:cpu_function_runtime", - "@local_xla//xla/backends/cpu:alignment", - ], -) - cc_library( name = "xla_jit_compiled_cpu_function", srcs = ["xla_jit_compiled_cpu_function.cc"], hdrs = ["xla_jit_compiled_cpu_function.h"], visibility = ["//visibility:public"], deps = [ + ":encoded_buffer_allocation_info", ":tf2xla", ":tf2xla_proto_cc", ":xla_compiled_cpu_function", @@ -513,9 +538,10 @@ cc_library( "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/types:span", "@local_tsl//tsl/platform:casts", - "@local_xla//xla:cpu_function_runtime", "@local_xla//xla:shape_util", "@local_xla//xla:xla_data_proto_cc", + "@local_xla//xla/backends/cpu:buffer_allocation_info", + "@local_xla//xla/backends/cpu:buffer_allocation_info_util", "@local_xla//xla/backends/cpu/codegen:compiled_function_library", "@local_xla//xla/client:client_library", "@local_xla//xla/client:executable_build_options", @@ -528,7 +554,6 @@ cc_library( ] + if_libtpu( if_false = [ "@local_xla//xla/service:cpu_plugin", - "@local_xla//xla/service/cpu:buffer_info_util", "@local_xla//xla/service/cpu:cpu_executable", ], if_true = [], diff --git a/tensorflow/compiler/tf2xla/allocator.cc b/tensorflow/compiler/tf2xla/allocator.cc index 7f7c3a351bbe87..08db8bb0261bc6 100644 --- a/tensorflow/compiler/tf2xla/allocator.cc +++ b/tensorflow/compiler/tf2xla/allocator.cc @@ -20,8 +20,9 @@ limitations under the License. 
#include #include "absl/base/dynamic_annotations.h" +#include "absl/types/span.h" #include "xla/backends/cpu/alignment.h" -#include "xla/cpu_function_runtime.h" +#include "xla/backends/cpu/buffer_allocation_info.h" namespace tensorflow { @@ -64,26 +65,26 @@ size_t align_to(size_t n, size_t align) { } // namespace size_t AlignedBufferBytes( - const xla::cpu_function_runtime::BufferInfo* buffer_infos, size_t n, + absl::Span buffers, bool allocate_entry_params) { size_t total = 0; - for (size_t i = 0; i < n; ++i) { + for (size_t i = 0; i < buffers.size(); ++i) { bool should_allocate = - buffer_infos[i].is_temp_buffer() || - (buffer_infos[i].is_entry_parameter() && allocate_entry_params); + buffers[i].is_temp() || buffers[i].is_result() || + (buffers[i].is_entry_parameter() && allocate_entry_params); if (should_allocate) { - total += align_to(buffer_infos[i].size(), xla::cpu::Align()); + total += align_to(buffers[i].size(), xla::cpu::Align()); } } return total; } void* MallocContiguousBuffers( - const xla::cpu_function_runtime::BufferInfo* buffer_infos, size_t n, + absl::Span buffers, bool allocate_entry_params, void** bufs, bool annotate_initialized) { const size_t total = - tensorflow::AlignedBufferBytes(buffer_infos, n, allocate_entry_params); + tensorflow::AlignedBufferBytes(buffers, allocate_entry_params); void* contiguous = nullptr; if (total > 0) { contiguous = aligned_malloc(total, xla::cpu::Align()); @@ -94,13 +95,13 @@ void* MallocContiguousBuffers( } } uintptr_t pos = reinterpret_cast(contiguous); - for (size_t i = 0; i < n; ++i) { + for (size_t i = 0; i < buffers.size(); ++i) { bool should_allocate = - buffer_infos[i].is_temp_buffer() || - (buffer_infos[i].is_entry_parameter() && allocate_entry_params); + buffers[i].is_temp() || buffers[i].is_result() || + (buffers[i].is_entry_parameter() && allocate_entry_params); if (should_allocate) { bufs[i] = reinterpret_cast(pos); - pos += align_to(buffer_infos[i].size(), xla::cpu::Align()); + pos += 
align_to(buffers[i].size(), xla::cpu::Align()); } else { bufs[i] = nullptr; } diff --git a/tensorflow/compiler/tf2xla/allocator.h b/tensorflow/compiler/tf2xla/allocator.h index 4ed60e4cb65535..b9d181ff60ba06 100644 --- a/tensorflow/compiler/tf2xla/allocator.h +++ b/tensorflow/compiler/tf2xla/allocator.h @@ -18,7 +18,8 @@ limitations under the License. #include -#include "xla/cpu_function_runtime.h" +#include "absl/types/span.h" +#include "xla/backends/cpu/buffer_allocation_info.h" namespace tensorflow { @@ -27,7 +28,7 @@ namespace tensorflow { // allocate_entry_params is false, entry parameters. There are `n` entries in // `buffer_infos`. Each buffer is aligned to Align() byte boundaries. size_t AlignedBufferBytes( - const xla::cpu_function_runtime::BufferInfo* buffer_infos, size_t n, + absl::Span buffers, bool allocate_entry_params); // MallocContiguousBuffers allocates buffers for use by the entry point @@ -43,7 +44,7 @@ size_t AlignedBufferBytes( // the head of the allocated contiguous block, which should be passed to // FreeContiguous when the buffers are no longer in use. void* MallocContiguousBuffers( - const xla::cpu_function_runtime::BufferInfo* buffer_infos, size_t n, + absl::Span buffers, bool allocate_entry_params, void** bufs, bool annotate_initialized); // FreeContiguous frees the contiguous block of memory allocated by diff --git a/tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc b/tensorflow/compiler/tf2xla/allocator_test.cc similarity index 71% rename from tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc rename to tensorflow/compiler/tf2xla/allocator_test.cc index 6904c58489f861..d5b9158c1fcb3f 100644 --- a/tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc +++ b/tensorflow/compiler/tf2xla/allocator_test.cc @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "xla/cpu_function_runtime.h" +#include "tensorflow/compiler/tf2xla/allocator.h" #include #include @@ -21,17 +21,17 @@ limitations under the License. #include #include -#include "tensorflow/compiler/tf2xla/allocator.h" #include "xla/backends/cpu/alignment.h" +#include "xla/backends/cpu/buffer_allocation_info.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { namespace { -using ::xla::cpu_function_runtime::BufferInfo; +using ::xla::cpu::BufferAllocationInfo; -TEST(XlaCompiledCpuFunctionTest, AlignmentValue) { +TEST(AllocatorTest, AlignmentValue) { // We've chosen 64 byte alignment for the tfcompile runtime to mimic the // regular tensorflow allocator, which was chosen to play nicely with Eigen. // The tfcompile runtime also has a requirement that comes from the xla @@ -41,38 +41,41 @@ TEST(XlaCompiledCpuFunctionTest, AlignmentValue) { EXPECT_LE(xla::cpu::MinAlign(), Allocator::kAllocatorAlignment); } -std::vector SizesToBufferInfos(const intptr_t* sizes, size_t n) { - std::vector buffer_infos; - std::transform(sizes, sizes + n, std::back_inserter(buffer_infos), - [&](intptr_t size) { - if (size == -1) { - // Use a dummy on-stack buffer allocation to indicate the - // the current slot does not need an allocation. 
- int64_t on_stack_buffer_size = 4; - return BufferInfo::MakeOnStackBuffer(on_stack_buffer_size); - } - return BufferInfo::MakeTempBuffer(size); - }); +std::vector SizesToBufferAllocationInfos( + const intptr_t* sizes, size_t n) { + std::vector buffer_infos; + std::transform( + sizes, sizes + n, std::back_inserter(buffer_infos), [&](intptr_t size) { + if (size == -1) { + // Use a dummy on-stack buffer allocation to indicate that + // the current slot does not need an allocation. + int64_t on_stack_buffer_size = 4; + return BufferAllocationInfo::ThreadLocal(on_stack_buffer_size); + } + return BufferAllocationInfo::Temp(size); + }); return buffer_infos; } // Simple wrappers to make writing tests more ergonomic. size_t AlignedBufferBytesFromSizes(const intptr_t* sizes, size_t n) { - std::vector buffer_infos = SizesToBufferInfos(sizes, n); - return tensorflow::AlignedBufferBytes(buffer_infos.data(), n, + std::vector buffer_infos = + SizesToBufferAllocationInfos(sizes, n); + return tensorflow::AlignedBufferBytes(buffer_infos, /*allocate_entry_params=*/false); } void* MallocContiguousBuffersFromSizes(const intptr_t* sizes, size_t n, void** bufs, bool annotate_initialized) { - std::vector buffer_infos = SizesToBufferInfos(sizes, n); - return tensorflow::MallocContiguousBuffers(buffer_infos.data(), n, + std::vector buffer_infos = + SizesToBufferAllocationInfos(sizes, n); + return tensorflow::MallocContiguousBuffers(buffer_infos, /*allocate_entry_params=*/false, bufs, annotate_initialized); } -TEST(XlaCompiledCpuFunctionTest, AlignedBufferBytes) { +TEST(AllocatorTest, AlignedBufferBytes) { EXPECT_EQ(AlignedBufferBytesFromSizes(nullptr, 0), 0); static constexpr intptr_t sizesA[1] = {-1}; @@ -96,7 +99,7 @@ void* add_ptr(void* base, uintptr_t delta) { // expected nullptrs, and write to each byte of allocated memory. We rely on // the leak checker to tell us if there's an inconsistency between malloc and // free. We also check the contiguous property.
-TEST(XlaCompiledCpuFunctionTest, MallocFreeContiguousBuffers) { +TEST(AllocatorTest, MallocFreeContiguousBuffers) { // Test empty sizes. void* base = MallocContiguousBuffersFromSizes(nullptr, 0, nullptr, false); EXPECT_EQ(base, nullptr); @@ -158,23 +161,5 @@ TEST(XlaCompiledCpuFunctionTest, MallocFreeContiguousBuffers) { FreeContiguous(base); } -void CheckRoundTripIsOk(const BufferInfo& buffer_info) { - BufferInfo round_trip(buffer_info.Encode()); - ASSERT_EQ(round_trip, buffer_info); -} - -TEST(XlaCompiledCpuFunctionTest, BufferInfoTest) { - CheckRoundTripIsOk(BufferInfo::MakeTempBuffer(0)); - CheckRoundTripIsOk(BufferInfo::MakeTempBuffer(4)); - CheckRoundTripIsOk(BufferInfo::MakeOnStackBuffer(0)); - CheckRoundTripIsOk(BufferInfo::MakeOnStackBuffer(4)); - CheckRoundTripIsOk(BufferInfo::MakeConstant(0)); - CheckRoundTripIsOk(BufferInfo::MakeConstant(4)); - CheckRoundTripIsOk( - BufferInfo::MakeEntryParameter(/*size=*/0, /*param_number=*/4)); - CheckRoundTripIsOk( - BufferInfo::MakeEntryParameter(/*size=*/4, /*param_number=*/0)); -} - } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/const_analysis_test.cc b/tensorflow/compiler/tf2xla/const_analysis_test.cc index c7c8702b49b774..d9f6927c09ecd6 100644 --- a/tensorflow/compiler/tf2xla/const_analysis_test.cc +++ b/tensorflow/compiler/tf2xla/const_analysis_test.cc @@ -180,7 +180,7 @@ TEST(ConstAnalysisTest, RespectExplicitAttr_0) { // not need to be a constant. Output reshape = ops::Reshape(root, arg1, add); reshape.node()->AddAttr(kXlaCompileTimeConstantInputsAttr, - std::vector()); + std::vector()); Graph graph(OpRegistry::Global()); TF_ASSERT_OK(root.ToGraph(&graph)); @@ -203,7 +203,7 @@ TEST(ConstAnalysisTest, RespectExplicitAttr_1) { // Force const analysis to pretend that the first argument to `add` needs to // be a constant. 
- std::vector add_constant_inputs; + std::vector add_constant_inputs; add_constant_inputs.push_back("x"); add.node()->AddAttr(kXlaCompileTimeConstantInputsAttr, add_constant_inputs); diff --git a/tensorflow/compiler/tf2xla/encoded_buffer_allocation_info.h b/tensorflow/compiler/tf2xla/encoded_buffer_allocation_info.h new file mode 100644 index 00000000000000..5981751259967a --- /dev/null +++ b/tensorflow/compiler/tf2xla/encoded_buffer_allocation_info.h @@ -0,0 +1,99 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_ENCODED_BUFFER_ALLOCATION_INFO_H_ +#define TENSORFLOW_COMPILER_TF2XLA_ENCODED_BUFFER_ALLOCATION_INFO_H_ + +#include + +#include "xla/backends/cpu/buffer_allocation_info.h" + +namespace xla { +namespace cpu { + +// Encoded version of `BufferAllocationInfo`, which can be used to reconstruct +// the `BufferAllocationInfo` later. It's used in the AOT compiler, to +// represent buffer allocation info as a lightweight struct. 
+struct EncodedBufferAllocationInfo { + EncodedBufferAllocationInfo(uint64_t packed_kind_and_size, + uint32_t entry_param_number, + uint32_t result_number) + : packed_kind_and_size(packed_kind_and_size), + entry_param_number(entry_param_number), + result_number(result_number) {} + + // Encodes BufferAllocationInfo into the struct that can be used to + // reconstruct the BufferAllocationInfo later using the constructor. We need + // this because we use BufferAllocationInfo in places where using protocol + // buffers would negatively impact binary size. + explicit EncodedBufferAllocationInfo( + const BufferAllocationInfo& buffer_info) { + packed_kind_and_size = Pack(buffer_info.kind(), buffer_info.size()); + entry_param_number = buffer_info.is_entry_parameter() + ? buffer_info.entry_parameter_number() + : -1; + result_number = buffer_info.is_result() ? buffer_info.result_number() : -1; + } + + explicit operator BufferAllocationInfo() const { + auto kind = UnpackKind(packed_kind_and_size); + auto size = UnpackSize(packed_kind_and_size); + int32_t entry_param_number = static_cast(this->entry_param_number); + int32_t result_number = static_cast(this->result_number); + + switch (kind) { + case BufferAllocationInfo::Kind::kConstant: + return BufferAllocationInfo::Constant(size); + case BufferAllocationInfo::Kind::kTemp: + return BufferAllocationInfo::Temp(size); + case BufferAllocationInfo::Kind::kParameter: + if (entry_param_number >= 0 && result_number >= 0) { + return BufferAllocationInfo::InOutParameter(size, entry_param_number, + result_number); + } + if (entry_param_number >= 0) { + return BufferAllocationInfo::EntryParameter(size, entry_param_number); + } + return BufferAllocationInfo::Result(size, result_number); + case BufferAllocationInfo::Kind::kThreadLocal: + return BufferAllocationInfo::ThreadLocal(size); + } + } + + static uint64_t Pack(BufferAllocationInfo::Kind kind, uint64_t size) { + return (static_cast(size) << 2) | static_cast(kind); + } + + static 
constexpr BufferAllocationInfo::Kind UnpackKind(uint64_t packed) { + return static_cast((packed << 62) >> 62); + } + + static constexpr uint64_t UnpackSize(uint64_t packed) { return packed >> 2; } + + uint64_t packed_kind_and_size = 0; + uint32_t entry_param_number = -1; + uint32_t result_number = -1; +}; +} // namespace cpu + +// TODO(ezhulenev): This is a temporary hack to keep `tfcompile` code working. +namespace cpu_function_runtime { +using BufferInfo = ::xla::cpu::BufferAllocationInfo; +using EncodedBufferInfo = ::xla::cpu::EncodedBufferAllocationInfo; +} // namespace cpu_function_runtime + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_TF2XLA_ENCODED_BUFFER_ALLOCATION_INFO_H_ diff --git a/third_party/xla/xla/backends/cpu/buffer_allocation_info_test.cc b/tensorflow/compiler/tf2xla/encoded_buffer_allocation_info_test.cc similarity index 81% rename from third_party/xla/xla/backends/cpu/buffer_allocation_info_test.cc rename to tensorflow/compiler/tf2xla/encoded_buffer_allocation_info_test.cc index b0b5bd57035fa2..c9fc52100abb33 100644 --- a/third_party/xla/xla/backends/cpu/buffer_allocation_info_test.cc +++ b/tensorflow/compiler/tf2xla/encoded_buffer_allocation_info_test.cc @@ -1,4 +1,4 @@ -/* Copyright 2025 The OpenXLA Authors. +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,16 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "xla/backends/cpu/buffer_allocation_info.h" +#include "tensorflow/compiler/tf2xla/encoded_buffer_allocation_info.h" #include +#include "xla/backends/cpu/buffer_allocation_info.h" namespace xla::cpu { namespace { -TEST(BufferAllocationInfoTest, RoundTrip) { +TEST(EncodedBufferAllocationInfoTest, RoundTrip) { auto round_trip = [](const BufferAllocationInfo& buffer_info) { - BufferAllocationInfo round_trip(buffer_info.Encode()); + EncodedBufferAllocationInfo encoded(buffer_info); + BufferAllocationInfo round_trip(encoded); ASSERT_EQ(round_trip, buffer_info); }; diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.cc b/tensorflow/compiler/tf2xla/functionalize_cond.cc index ba297127eae117..2adc83512c6617 100644 --- a/tensorflow/compiler/tf2xla/functionalize_cond.cc +++ b/tensorflow/compiler/tf2xla/functionalize_cond.cc @@ -83,11 +83,11 @@ struct ClusterTupleLessThan { }; // TODO(jpienaar): Move to OutputTensor. 
-string DebugString(const OutputTensor& tensor) { +std::string DebugString(const OutputTensor& tensor) { return absl::StrCat(tensor.node->name(), ":", tensor.index); } -string Branch_Name(BranchType b) { +std::string Branch_Name(BranchType b) { switch (b) { case BranchType::kElseBranch: return "else"; @@ -100,13 +100,13 @@ string Branch_Name(BranchType b) { } } -string DebugString(StateMap::CondId cond_state) { +std::string DebugString(StateMap::CondId cond_state) { if (cond_state == nullptr || cond_state->empty()) return "{}"; using value_type = StateMap::CondState::value_type; return absl::StrCat( "{", absl::StrJoin(*cond_state, ", ", - [](string* output, const value_type& pred_branch) { + [](std::string* output, const value_type& pred_branch) { const OutputTensor& pred = pred_branch.first; const BranchType& branch = pred_branch.second; if (branch == BranchType::kNeither) @@ -200,7 +200,7 @@ struct CondArgNode { explicit CondArgNode(Node* src, int src_output) : src(src), src_output(src_output) {} - string ToString() const { + std::string ToString() const { return absl::StrCat("src=", src->name(), ":", src_output, " switches=", NodesToString(switches)); } @@ -212,11 +212,11 @@ struct CondArgNode { }; using CondArgNodes = std::vector; -string DebugString(const CondArgNodes& nodes) { +std::string DebugString(const CondArgNodes& nodes) { return absl::StrCat( "[", absl::StrJoin(nodes, ", ", - [](string* output, const CondArgNode& node) { + [](std::string* output, const CondArgNode& node) { absl::StrAppend(output, node.ToString()); }), "]"); @@ -263,20 +263,20 @@ void StateMap::ResetAncestorId(const Node* node, StateMap::AncestorId id) { void StateMap::MarkDead(const Node* node) { ResetCondId(node, dead_id_); } -string StateMap::CondStateToString(const Node* node) const { +std::string StateMap::CondStateToString(const Node* node) const { return CondStateToString(LookupCondId(node)); } -string StateMap::CondStateToString(StateMap::CondId id) const { +std::string 
StateMap::CondStateToString(StateMap::CondId id) const { return DebugString(id); } -string StateMap::AncestorStateToString(const Node* node) const { +std::string StateMap::AncestorStateToString(const Node* node) const { if (auto id = LookupAncestorId(node)) { return absl::StrCat( "{", absl::StrJoin(*id, ",", - [](string* output, const AncestorNode& ancestor) { + [](std::string* output, const AncestorNode& ancestor) { absl::StrAppend(output, ancestor.output_tensor.node->name(), ":", ancestor.output_tensor.index); @@ -340,7 +340,7 @@ class Conditional { // Internal name of conditional. The name is based on the first merge node // added. - string name() const; + std::string name() const; // The FunctionalizeCond instance that created this. FunctionalizeCond* parent_; @@ -751,7 +751,7 @@ absl::Status Conditional::BuildIfNode(Graph* graph, VLOG(2) << "Build cond function for " << name(); NodeDebugInfo debug_info((*merges_.begin())->def()); NodeDefBuilder builder(name(), "If", library, &debug_info); - const string branch_name[] = {"else_branch", "then_branch"}; + const std::string branch_name[] = {"else_branch", "then_branch"}; for (auto branch : {BranchType::kElseBranch, BranchType::kThenBranch}) { int branch_index = static_cast(branch); @@ -817,7 +817,7 @@ absl::Status Conditional::BuildIfNode(Graph* graph, builder.Attr("Tcond", DT_BOOL); // Add some internal attributes which need to be propagated. 
for (absl::string_view attr_name : kAttrsToPropagate) { - string attr_val; + std::string attr_val; if (GetNodeAttr(predicate_.node->def(), attr_name, &attr_val).ok()) { builder.Attr(attr_name, attr_val); } @@ -949,7 +949,7 @@ absl::Status Conditional::BuildAndReplace( return absl::OkStatus(); } -string Conditional::name() const { +std::string Conditional::name() const { CHECK(!merges_.empty()); return absl::StrCat((*merges_.begin())->name(), "_if"); } @@ -958,7 +958,7 @@ absl::Status FunctionalizeCond::AddIdentityNode(const Node* replacee, Node* if_node, int port) { NodeBuilder id_builder(replacee->name(), "Identity"); id_builder.Input(if_node, port); - string outside_compilation; + std::string outside_compilation; if (GetNodeAttr(if_node->def(), kXlaOutsideCompilationAttr, &outside_compilation) .ok()) { @@ -1580,7 +1580,7 @@ absl::Status FunctionalizeCond::FunctionalizeInternal() { return absl::OkStatus(); } -void FunctionalizeCond::DumpGraphWithCondState(const string& name) { +void FunctionalizeCond::DumpGraphWithCondState(const std::string& name) { const char* const kCondGroupDebugAttr = "_XlaFunctionalizeCondGroup"; for (Node* n : graph_->nodes()) { diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.h b/tensorflow/compiler/tf2xla/functionalize_cond.h index e37555b053d7ed..25d773ad50a105 100644 --- a/tensorflow/compiler/tf2xla/functionalize_cond.h +++ b/tensorflow/compiler/tf2xla/functionalize_cond.h @@ -136,11 +136,11 @@ class StateMap { BranchType FindBranchOf(CondId id, OutputTensor predicate) const; // Returns textual representation of node's CondState. - string CondStateToString(const Node* node) const; - string CondStateToString(CondId id) const; + std::string CondStateToString(const Node* node) const; + std::string CondStateToString(CondId id) const; // Returns textual representation of node's AncestorState. 
- string AncestorStateToString(const Node* node) const; + std::string AncestorStateToString(const Node* node) const; // Returns whether the cond state is the dead state. bool IsDead(CondId id) const; @@ -201,7 +201,7 @@ class FunctionalizeCond { absl::Status PropagateUpdatedState(const Node* replacee); // Dump graph with the CondState annotated. - void DumpGraphWithCondState(const string& name); + void DumpGraphWithCondState(const std::string& name); // Adds `switch_id` to the list of Switch node ids. void AddSwitchId(int switch_id); diff --git a/tensorflow/compiler/tf2xla/functionalize_cond_test.cc b/tensorflow/compiler/tf2xla/functionalize_cond_test.cc index 50bd47ad73e77e..edb2a7e0ea1b33 100644 --- a/tensorflow/compiler/tf2xla/functionalize_cond_test.cc +++ b/tensorflow/compiler/tf2xla/functionalize_cond_test.cc @@ -48,7 +48,7 @@ class FunctionalizeCondTest : public ::testing::Test { return fc_->state_map_.GetCondId(state); } - string GetString(const StateMap::StateMap::CondId id) { + std::string GetString(const StateMap::StateMap::CondId id) { return fc_->state_map_.CondStateToString(id); } diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index ac38725269bfd9..22b9b9187ecd7d 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -51,8 +51,9 @@ namespace tensorflow { // Maps function name to // - new function name, if the function body was functionalized // - std::nullopt, if not -using FuncMap = std::map>; -using FuncMapIter = std::map>::const_iterator; +using FuncMap = std::map>; +using FuncMapIter = + std::map>::const_iterator; // Returns whether function has been processed before. bool FunctionHasBeenProcessed(FuncMapIter func_iter, const FuncMap* func_map) { @@ -65,8 +66,8 @@ bool FunctionHasBeenModified(FuncMapIter func_iter) { } // Returns a name for the new functionalized version of a function. 
-string GetNewFunctionName( - const string& func_name, Node* n, +std::string GetNewFunctionName( + const std::string& func_name, Node* n, AssociatedFunctionInfo::AssociatedFunctionType func_type, FunctionLibraryDefinition* fld) { // For SymbolicGradient, `func_name` is always "SymbolicGradient" which @@ -79,14 +80,15 @@ string GetNewFunctionName( } // Returns name to which a modified function has been mapped. -const string& GetMappedFunctionName(FuncMapIter func_iter) { +const std::string& GetMappedFunctionName(FuncMapIter func_iter) { DCHECK(func_iter->second.has_value()); return func_iter->second.value(); } // Updates `func_map` with function given by `canonicalized_name`. -void UpdateFunctionMap(FuncMap* func_map, const string& canonicalized_name, - const string& new_func_name, bool function_modified) { +void UpdateFunctionMap(FuncMap* func_map, const std::string& canonicalized_name, + const std::string& new_func_name, + bool function_modified) { // If function was modified store its new name, otherwise add empty entry to // record that function has been processed and does not need to be rewritten. (*func_map)[canonicalized_name] = @@ -95,8 +97,9 @@ void UpdateFunctionMap(FuncMap* func_map, const string& canonicalized_name, // Adds new function def to graph's function library if necessary. absl::Status AddFunctionDefToGraphLibrary( - const string& func_name, const AssociatedFunctionInfo& associated_function, - Graph* graph, FunctionLibraryDefinition* fld) { + const std::string& func_name, + const AssociatedFunctionInfo& associated_function, Graph* graph, + FunctionLibraryDefinition* fld) { const OpRegistrationData* op_reg_data; // We have to be careful with adding the function def since there are three // different `OpRegistryInterface`s involved here: @@ -129,8 +132,8 @@ absl::Status AddFunctionDefToGraphLibrary( // Functionalizes function given by `func_name`. Update `func_map` accordingly. 
absl::Status FunctionalizeControlFlowForFunction( - const string& func_name, const string& new_func_name, - const protobuf::Map& attrs, + const std::string& func_name, const std::string& new_func_name, + const protobuf::Map& attrs, FunctionLibraryDefinition* fld, FunctionLibraryRuntime* flr, FuncMap* func_map, bool* function_modified, const NodeFilter& node_filter = {}); @@ -165,11 +168,11 @@ absl::Status FunctionalizeControlFlowForNodeAssociatedFunctions( associated_functions.size() == 1); // Process one node-function-pair. - string func_name = associated_function.func_name(); - string canonicalized_name = + std::string func_name = associated_function.func_name(); + std::string canonicalized_name = Canonicalize(func_name, AttrSlice(&associated_function.attrs())); auto func_iter = func_map->find(canonicalized_name); - string new_func_name; + std::string new_func_name; if (FunctionHasBeenProcessed(func_iter, func_map)) { if (FunctionHasBeenModified(func_iter)) { *any_function_modified = true; @@ -202,8 +205,8 @@ absl::Status FunctionalizeControlFlowForNodeAssociatedFunctions( } absl::Status FunctionalizeControlFlowForFunction( - const string& func_name, const string& new_func_name, - const protobuf::Map& attrs, + const std::string& func_name, const std::string& new_func_name, + const protobuf::Map& attrs, FunctionLibraryDefinition* fld, FunctionLibraryRuntime* flr, FuncMap* func_map, bool* function_modified, const NodeFilter& node_filter) { *function_modified = false; @@ -341,8 +344,8 @@ absl::Status FunctionalizeControlFlowForXlaPass::Run( // Find XLA compile ops and its corresponding FunctionDef. // TPUCompile op is not in the map because graph rewriting might happen // multiple times, and we want to avoid functionalize it again. - static std::map* kNodeTypeToFunctionAttrMapping = - new std::map{ + static std::map* kNodeTypeToFunctionAttrMapping = + new std::map{ // _TPUReplicate ops are generated by EncapsulateTPUComputationsPass. 
{"_TPUReplicate", "computation"}, // XlaLaunch ops are generated by EncapsulateXlaComputationsPass. @@ -355,12 +358,12 @@ absl::Status FunctionalizeControlFlowForXlaPass::Run( if (it == kNodeTypeToFunctionAttrMapping->end()) { continue; } - const string func_attr = it->second; + const std::string func_attr = it->second; NameAttrList func; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), func_attr, &func)); VLOG(2) << "Graph has node " << n->type_string() << ". Corresponding function: " << func.name(); - string new_func_name = options.flib_def->UniqueFunctionName( + std::string new_func_name = options.flib_def->UniqueFunctionName( absl::StrCat(func.name(), "_f15n_")); bool modified; TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction( diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc index 7727853a8c4233..24fe7f5e13e7e0 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc @@ -46,7 +46,7 @@ namespace { // Returns the names of the "then" and "else" functions for the If node in a // graph. -absl::Status FindIfThenAndElse(const GraphDef& graph, string* op_name, +absl::Status FindIfThenAndElse(const GraphDef& graph, std::string* op_name, NameAttrList* then_fn, NameAttrList* else_fn) { for (const NodeDef& node : graph.node()) { if (node.op() == "If") { @@ -97,7 +97,7 @@ INSTANTIATE_TEST_SUITE_P( info) { bool restrict_to_tpu_nodes = std::get<0>(info.param); bool wrap_cond_in_function = std::get<1>(info.param); - string name = + std::string name = absl::StrCat(restrict_to_tpu_nodes ? "with_filter" : "without_filter", wrap_cond_in_function ? 
"_in_function" : "_in_graph"); return name; @@ -114,7 +114,7 @@ void ConditionalTestFixture::BuildCondGraph(Graph* cond_graph) { auto identity_t = ops::Identity(scope.WithOpName("cond/Identity"), switch_1.output_true); - auto seventeen = ops::Const( + auto seventeen = ops::Const( scope.WithOpName("cond").WithControlDependencies(identity_t), 17); auto switch_2 = ops::Switch(scope.WithOpName("cond/Switch"), y, less); auto mul = ops::Multiply(scope.WithOpName("cond/Mul"), switch_2.output_true, @@ -122,7 +122,7 @@ void ConditionalTestFixture::BuildCondGraph(Graph* cond_graph) { auto identity_f = ops::Identity(scope.WithOpName("cond/Identity"), switch_1.output_false); - auto twenty_three = ops::Const( + auto twenty_three = ops::Const( scope.WithOpName("cond").WithControlDependencies(identity_f), 23); auto switch_3 = ops::Switch(scope.WithOpName("cond/Switch"), x, less); auto add = ops::Add(scope.WithOpName("cond/false/add"), @@ -146,7 +146,7 @@ void ConditionalTestFixture::BuildCondGraph(Graph* cond_graph) { void ConditionalTestFixture::CheckGraphDef( const GraphDef& graph_def, const FunctionLibraryDefinition& library) { - string op_name; + std::string op_name; NameAttrList then_fn; NameAttrList else_fn; TF_EXPECT_OK(FindIfThenAndElse(graph_def, &op_name, &then_fn, &else_fn)); @@ -285,7 +285,7 @@ void ConditionalTestFixture::RunTest() { FunctionLibraryRuntime::Handle handle; // Functionalized function name is the type string of `cond_node`. 
- string func_name; + std::string func_name; for (Node* n : graph.nodes()) { if (n->name() == "cond_node") { func_name = n->type_string(); @@ -341,7 +341,7 @@ TEST(FunctionalizeControlFlow, OneLoopVar) { ops::internal::Enter(scope.WithOpName("while/Enter2"), source, "aloop"); auto merge = ops::Merge(scope.WithOpName("while/Merge"), std::initializer_list{enter, dummy}); - auto ten = ops::Const( + auto ten = ops::Const( scope.WithOpName("while/Less/y").WithControlDependencies(merge.output), 10); auto less = ops::Less(scope.WithOpName("while/Less"), merge.output, ten); @@ -352,7 +352,7 @@ TEST(FunctionalizeControlFlow, OneLoopVar) { switch_.output_false); auto identity = ops::Identity(scope.WithOpName("while/Identity"), switch_.output_true); - auto one = ops::Const( + auto one = ops::Const( scope.WithOpName("while/add/y").WithControlDependencies(identity), 1); auto add = ops::Add(scope.WithOpName("while/add"), identity, one); auto next_iteration = @@ -405,7 +405,7 @@ TEST(FunctionalizeControlFlow, OneLoopVar) { { Scope scope = Scope::NewRootScope().ExitOnError(); auto arg = ops::_Arg(scope.WithOpName("arg0"), DT_INT32, 0); - auto ten = ops::Const( + auto ten = ops::Const( scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10); auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten); auto retval = ops::_Retval(scope.WithOpName("retval0_RetVal"), less, 0); @@ -427,7 +427,7 @@ TEST(FunctionalizeControlFlow, OneLoopVar) { Scope scope = Scope::NewRootScope().ExitOnError(); auto arg = ops::_Arg(scope.WithOpName("arg0"), DT_INT32, 0); auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg); - auto one = ops::Const( + auto one = ops::Const( scope.WithOpName("while/add/y").WithControlDependencies(identity), 1); auto add = ops::Add(scope.WithOpName("while/add"), identity, one); auto retval = ops::_Retval(scope.WithOpName("retval0_RetVal"), add, 0); @@ -463,7 +463,8 @@ FunctionDef GetNoinlineFunctionDef() { // return [x + 1] // Define the above 
function, and add it to the given graph. It's used as the // while loop body in NoinlineLoopBody test. -absl::Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) { +absl::Status AddNoinlineFunctionToGraph(const std::string& node_name, + Graph* graph) { FunctionDefLibrary fdef_lib; *(fdef_lib.add_function()) = GetNoinlineFunctionDef(); TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdef_lib)); @@ -481,7 +482,7 @@ absl::Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) { // x = array_ops.placeholder(dtypes.int32) // y = control_flow_ops.while_loop(lambda i: i < 10, increment_fn, [x]) TEST(FunctionalizeControlFlow, NoinlineLoopBody) { - const string& noinline_node_name = "while/increment_fn"; + const std::string& noinline_node_name = "while/increment_fn"; Graph graph(OpRegistry::Global()); { Scope scope = Scope::NewRootScope().ExitOnError(); @@ -491,7 +492,7 @@ TEST(FunctionalizeControlFlow, NoinlineLoopBody) { "while/while_context"); auto merge = ops::Merge(scope.WithOpName("while/Merge"), std::initializer_list{enter, dummy}); - auto ten = ops::Const( + auto ten = ops::Const( scope.WithOpName("while/Less/y").WithControlDependencies(merge.output), 10); auto less = ops::Less(scope.WithOpName("while/Less"), merge.output, ten); @@ -585,7 +586,7 @@ TEST(FunctionalizeControlFlow, NoinlineLoopBody) { } TEST(FunctionalizeControlFlow, MissingFunctionDefInLibrary) { - const string& noinline_node_name = "while/increment_fn"; + const std::string& noinline_node_name = "while/increment_fn"; Graph graph(OpRegistry::Global()); { Scope scope = Scope::NewRootScope().ExitOnError(); @@ -622,7 +623,7 @@ TEST(FunctionalizeControlFlow, OneLoopVarWithoutExit) { ops::internal::Enter(scope.WithOpName("while/Enter"), source, "aloop"); auto merge = ops::Merge(scope.WithOpName("while/Merge"), std::initializer_list{enter, dummy}); - auto ten = ops::Const( + auto ten = ops::Const( scope.WithOpName("while/Less/y").WithControlDependencies(merge.output), 10); 
auto less = ops::Less(scope.WithOpName("while/Less"), merge.output, ten); @@ -631,7 +632,7 @@ TEST(FunctionalizeControlFlow, OneLoopVarWithoutExit) { ops::Switch(scope.WithOpName("while/Switch"), merge.output, loop_cond); auto identity = ops::Identity(scope.WithOpName("while/Identity"), switch_.output_true); - auto one = ops::Const( + auto one = ops::Const( scope.WithOpName("while/add/y").WithControlDependencies(identity), 1); auto add = ops::Add(scope.WithOpName("while/add"), identity, one); auto next_iteration = @@ -673,7 +674,7 @@ TEST(FunctionalizeControlFlow, OneLoopVarWithoutExit) { { Scope scope = Scope::NewRootScope().ExitOnError(); auto arg = ops::_Arg(scope.WithOpName("arg0"), DT_INT32, 0); - auto ten = ops::Const( + auto ten = ops::Const( scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10); auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten); auto retval = ops::_Retval(scope.WithOpName("retval0_RetVal"), less, 0); @@ -695,7 +696,7 @@ TEST(FunctionalizeControlFlow, OneLoopVarWithoutExit) { Scope scope = Scope::NewRootScope().ExitOnError(); auto arg = ops::_Arg(scope.WithOpName("arg0"), DT_INT32, 0); auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg); - auto one = ops::Const( + auto one = ops::Const( scope.WithOpName("while/add/y").WithControlDependencies(identity), 1); auto add = ops::Add(scope.WithOpName("while/add"), identity, one); auto retval = ops::_Retval(scope.WithOpName("retval0_RetVal"), add, 0); @@ -739,14 +740,15 @@ TEST(FunctionalizeControlFlow, TwoLoopVars) { std::initializer_list{enter_y, dummy}); // Loop condition - auto three = ops::Const(scope.WithOpName("while/cond/three") - .WithControlDependencies(merge_x.output), - 3); + auto three = + ops::Const(scope.WithOpName("while/cond/three") + .WithControlDependencies(merge_x.output), + 3); auto cond_add = ops::Add(scope.WithOpName("while/cond/Add"), merge_x.output, three); - auto ten = ops::Const(scope.WithOpName("while/cond/ten") - 
.WithControlDependencies(merge_x.output), - 10); + auto ten = ops::Const(scope.WithOpName("while/cond/ten") + .WithControlDependencies(merge_x.output), + 10); auto less = ops::Less(scope.WithOpName("while/cond/Less"), cond_add, ten); auto loop_cond = ops::LoopCond(scope.WithOpName("while/LoopCond"), less); @@ -765,10 +767,10 @@ TEST(FunctionalizeControlFlow, TwoLoopVars) { auto identity_y = ops::Identity(scope.WithOpName("while/Identity/y"), switch_y.output_true); - auto one = ops::Const( + auto one = ops::Const( scope.WithOpName("while/add/one").WithControlDependencies(identity_x), 1); - auto two = ops::Const( + auto two = ops::Const( scope.WithOpName("while/mul/two").WithControlDependencies(identity_x), 2); @@ -825,14 +827,15 @@ TEST(FunctionalizeControlFlow, TwoLoopVars) { Scope scope = Scope::NewRootScope().ExitOnError(); auto arg0 = ops::_Arg(scope.WithOpName("arg0"), DT_INT32, 0); auto arg1 = ops::_Arg(scope.WithOpName("arg1"), DT_INT32, 1); - auto three = ops::Const(scope.WithOpName("while/cond/three") - .WithControlDependencies(arg0.output), - 3); + auto three = + ops::Const(scope.WithOpName("while/cond/three") + .WithControlDependencies(arg0.output), + 3); auto cond_add = ops::Add(scope.WithOpName("while/cond/Add"), arg0.output, three); - auto ten = ops::Const(scope.WithOpName("while/cond/ten") - .WithControlDependencies(arg0.output), - 10); + auto ten = ops::Const(scope.WithOpName("while/cond/ten") + .WithControlDependencies(arg0.output), + 10); auto less = ops::Less(scope.WithOpName("while/cond/Less"), cond_add, ten); auto retval = ops::_Retval(scope.WithOpName("retval0_RetVal"), less, 0); @@ -859,10 +862,10 @@ TEST(FunctionalizeControlFlow, TwoLoopVars) { auto identity_y = ops::Identity(scope.WithOpName("while/Identity/y"), arg1); - auto one = ops::Const( + auto one = ops::Const( scope.WithOpName("while/add/one").WithControlDependencies(identity_x), 1); - auto two = ops::Const( + auto two = ops::Const( 
scope.WithOpName("while/mul/two").WithControlDependencies(identity_x), 2); @@ -922,7 +925,7 @@ INSTANTIATE_TEST_SUITE_P( bool mark_inner_loop_tpu = std::get<1>(info.param); bool mark_outer_loop_tpu = std::get<2>(info.param); - string node_string; + std::string node_string; if (mark_inner_loop_tpu && mark_outer_loop_tpu) node_string = "both_loops_tpu"; else if (!mark_inner_loop_tpu && !mark_outer_loop_tpu) @@ -930,7 +933,7 @@ INSTANTIATE_TEST_SUITE_P( else node_string = mark_inner_loop_tpu ? "inner_loop_tpu" : "outer_loop_tpu"; - string name = absl::StrCat( + std::string name = absl::StrCat( restrict_to_tpu_nodes ? "restricted_" : "unrestricted_", node_string); return name; }); @@ -961,21 +964,21 @@ void ComplexTestFixture::RunTest() { auto dummy = ops::Placeholder(scope.WithOpName("Dummy"), DT_INT32); auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32); - auto three = ops::Const(scope.WithOpName("three"), 3); + auto three = ops::Const(scope.WithOpName("three"), 3); auto y = ops::Add(scope.WithOpName("y"), x, three); auto var = ops::VarHandleOp(scope.WithOpName("Variable"), DT_INT32, TensorShape({})); // Outer loop - auto zero = ops::Const(scope.WithOpName("outer/Const"), 0); + auto zero = ops::Const(scope.WithOpName("outer/Const"), 0); auto enter_i = ops::internal::Enter(scope.WithOpName("outer/Enter_i"), zero, "outer"); auto merge_i = ops::Merge(scope.WithOpName("outer/Merge_i"), std::initializer_list{enter_i, dummy}); - auto ten = ops::Const(scope.WithOpName("outer/Less/y") - .WithControlDependencies(merge_i.output), - 10); + auto ten = ops::Const(scope.WithOpName("outer/Less/y") + .WithControlDependencies(merge_i.output), + 10); auto less_i = ops::Less(scope.WithOpName("outer/Less_i"), merge_i.output, ten); auto outer_loop_cond = @@ -998,7 +1001,7 @@ void ComplexTestFixture::RunTest() { ops::internal::Enter::Attrs().IsConstant(true)); // Inner loop - auto one_j = ops::Const( + auto one_j = ops::Const( 
scope.WithOpName("outer/j").WithControlDependencies(identity_i), 1); auto enter_j = ops::internal::Enter(scope.WithOpName("outer/inner/Enter_j"), one_j, "inner"); @@ -1018,9 +1021,10 @@ void ComplexTestFixture::RunTest() { auto merge_k = ops::Merge(scope.WithOpName("outer/inner/Merge_k"), std::initializer_list{enter_k, dummy}); - auto five = ops::Const(scope.WithOpName("outer/inner/Five") - .WithControlDependencies(merge_j.output), - 5); + auto five = + ops::Const(scope.WithOpName("outer/inner/Five") + .WithControlDependencies(merge_j.output), + 5); auto less_j = ops::Less(scope.WithOpName("outer/inner/Less_j"), merge_j.output, five); auto loop_cond = @@ -1047,7 +1051,7 @@ void ComplexTestFixture::RunTest() { auto assign = ops::AssignAddVariableOp( scope.WithOpName("outer/inner/assign_add"), enter_var, add_jkx); - auto one = ops::Const( + auto one = ops::Const( scope.WithOpName("outer/inner/One") .WithControlDependencies( absl::Span{assign.operation}), @@ -1061,7 +1065,7 @@ void ComplexTestFixture::RunTest() { scope.WithOpName("outer/inner/NextIteration_k"), identity_k); // Body and backedge for outer loop. - auto one_outer = ops::Const( + auto one_outer = ops::Const( scope.WithOpName("outer/add/y").WithControlDependencies(identity_i), 1); auto add_i = ops::Add(scope.WithOpName("outer/add") @@ -1086,9 +1090,10 @@ void ComplexTestFixture::RunTest() { } // Add '_tpu_replicate' attributes as specified. 
for (Node* n : graph.nodes()) { - string name = n->name(); - bool is_inner_node = name.find("outer/inner/") != string::npos; - bool is_outer_node = !is_inner_node && name.find("outer/") != string::npos; + std::string name = n->name(); + bool is_inner_node = name.find("outer/inner/") != std::string::npos; + bool is_outer_node = + !is_inner_node && name.find("outer/") != std::string::npos; if ((is_inner_node && mark_inner_loop_tpu_) || (is_outer_node && mark_outer_loop_tpu_)) { n->AddAttr("_tpu_replicate", "cluster"); @@ -1159,13 +1164,13 @@ void ComplexTestFixture::CheckOuterNodesFunctionalized( { Scope scope = Scope::NewRootScope().ExitOnError(); auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32); - auto three = ops::Const(scope.WithOpName("three"), 3); + auto three = ops::Const(scope.WithOpName("three"), 3); auto y = ops::Add(scope.WithOpName("y"), x, three); auto var = ops::VarHandleOp(scope.WithOpName("Variable"), DT_INT32, TensorShape({})); - auto zero = ops::Const(scope.WithOpName("outer/Const"), 0); + auto zero = ops::Const(scope.WithOpName("outer/Const"), 0); auto while_op = ops::While(scope.WithOpName("outer/LoopCond"), std::initializer_list{zero, y, x, var}, @@ -1184,7 +1189,7 @@ void ComplexTestFixture::CheckOuterNodesFunctionalized( auto arg2 = ops::_Arg(scope.WithOpName("arg2"), DT_INT32, 2); auto arg3 = ops::_Arg(scope.WithOpName("arg3"), DT_RESOURCE, 3); - auto ten = ops::Const( + auto ten = ops::Const( scope.WithOpName("outer/Less/y").WithControlDependencies(arg0.output), 10); auto less = ops::Less(scope.WithOpName("outer/Less_i"), arg0, ten); @@ -1220,14 +1225,14 @@ void ComplexTestFixture::CheckOuterNodesFunctionalized( auto arg3 = ops::_Arg(scope.WithOpName("arg3"), DT_RESOURCE, 3); auto identity_i = ops::Identity(scope.WithOpName("outer/Identity"), arg0); - auto one_j = ops::Const( + auto one_j = ops::Const( scope.WithOpName("outer/j").WithControlDependencies(identity_i), 1); auto while_op = 
ops::While(scope.WithOpName("outer/inner/LoopCond"), std::initializer_list{one_j, arg1, arg2, arg3}, inner_cond_fn, inner_body_fn); - auto one_outer = ops::Const( + auto one_outer = ops::Const( scope.WithOpName("outer/add/y").WithControlDependencies(identity_i), 1); auto add_i = ops::Add(scope.WithOpName("outer/add") @@ -1262,7 +1267,7 @@ void ComplexTestFixture::CheckInnerNodesFunctionalized( auto arg2 = ops::_Arg(scope.WithOpName("arg2"), DT_INT32, 2); auto arg3 = ops::_Arg(scope.WithOpName("arg3"), DT_RESOURCE, 3); - auto five = ops::Const( + auto five = ops::Const( scope.WithOpName("outer/inner/Five").WithControlDependencies(arg0), 5); auto less_j = ops::Less(scope.WithOpName("outer/inner/Less_j"), arg0, five); auto retval = ops::_Retval(scope.WithOpName("retval0_RetVal"), less_j, 0); @@ -1299,7 +1304,7 @@ void ComplexTestFixture::CheckInnerNodesFunctionalized( auto assign = ops::AssignAddVariableOp( scope.WithOpName("outer/inner/assign_add"), arg3, add_jkx); - auto one = ops::Const( + auto one = ops::Const( scope.WithOpName("outer/inner/One") .WithControlDependencies( absl::Span{assign.operation}), diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc index cf3413154b8baa..d8558e7fb2b5fe 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc @@ -42,7 +42,7 @@ absl::StatusOr BuildRetvalNode(Graph* graph, DataType type, int index) { absl::Status ExtractWhileLoopFrames( const std::vector& cf_info, const Graph* graph, - std::unordered_map* frames, + std::unordered_map* frames, const NodeFilter& node_filter) { for (Node* node : graph->op_nodes()) { const ControlFlowInfo& cf = cf_info[node->id()]; diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h index 970f62daa42af3..90c50f75e36387 100644 --- 
a/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h @@ -47,7 +47,7 @@ struct WhileLoopArg { // Information about a loop frame. struct WhileLoopFrame { - string name; + std::string name; // Pointer to the parent frame. The root frame has a pointer to itself. WhileLoopFrame* parent = nullptr; @@ -76,7 +76,7 @@ struct WhileLoopFrame { // `FunctionalizeControlFlow` for more details about node filters). absl::Status ExtractWhileLoopFrames( const std::vector& cf_info, const Graph* graph, - std::unordered_map* frames, + std::unordered_map* frames, const NodeFilter& node_filter = {}); // Check that the graph has no cycle containing the given node. @@ -97,10 +97,10 @@ absl::StatusOr BuildRetvalNode(Graph* graph, DataType type, int index); // Returns a textual representation of the names of the nodes in the input. template -string NodesToString(const T& nodes) { +std::string NodesToString(const T& nodes) { return absl::StrCat("{", absl::StrJoin(nodes, ",", - [](string* output, const Node* node) { + [](std::string* output, const Node* node) { absl::StrAppend(output, node->name()); }), "}"); diff --git a/tensorflow/compiler/tf2xla/functionalize_while.cc b/tensorflow/compiler/tf2xla/functionalize_while.cc index 2c02379c36cd45..b8183afd59481a 100644 --- a/tensorflow/compiler/tf2xla/functionalize_while.cc +++ b/tensorflow/compiler/tf2xla/functionalize_while.cc @@ -438,7 +438,7 @@ absl::Status FunctionalizeLoop(Graph* graph, WhileLoopFrame* frame, builder.Attr("body", body_name); // Add some internal attributes which need to be propagated. for (absl::string_view attr_name : kAttrsToPropagate) { - string attr_val; + std::string attr_val; if (GetNodeAttr(frame->loop_cond->def(), attr_name, &attr_val).ok()) { builder.Attr(attr_name, attr_val); } @@ -513,7 +513,7 @@ absl::Status FunctionalizeWhileLoop(Graph* graph, // connected to all source nodes in the graph. Many graphs violate this // invariant. 
std::vector cf_info; - std::vector unreachable_nodes; + std::vector unreachable_nodes; TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph, &cf_info, &unreachable_nodes)); if (!unreachable_nodes.empty()) { return errors::InvalidArgument( @@ -522,7 +522,7 @@ absl::Status FunctionalizeWhileLoop(Graph* graph, } // Builds Frames, indexed by name. - std::unordered_map frames; + std::unordered_map frames; TF_RETURN_IF_ERROR( ExtractWhileLoopFrames(cf_info, graph, &frames, node_filter)); diff --git a/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc b/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc index 2759ad8384cd81..b331272a2c9504 100644 --- a/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc +++ b/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc @@ -42,7 +42,7 @@ limitations under the License. namespace tensorflow { namespace { -absl::Status GetTestDevice(Session* session, string* test_device) { +absl::Status GetTestDevice(Session* session, std::string* test_device) { std::vector devices; TF_RETURN_IF_ERROR(session->ListDevices(&devices)); @@ -85,7 +85,7 @@ TEST(FusedBatchnormReserveSpaceTest, Test) { std::unique_ptr session( tensorflow::NewSession(tensorflow::SessionOptions{})); - string test_device; + std::string test_device; TF_ASSERT_OK(GetTestDevice(session.get(), &test_device)); Scope root = tensorflow::Scope::NewRootScope(); @@ -108,8 +108,8 @@ TEST(FusedBatchnormReserveSpaceTest, Test) { Output variance = Const(root.WithOpName("variance"), Input::Initializer(variance_data)); - string tf_device = absl::StrCat("/device:", test_device, ":0"); - string xla_device = absl::StrCat("/device:XLA_", test_device, ":0"); + std::string tf_device = absl::StrCat("/device:", test_device, ":0"); + std::string xla_device = absl::StrCat("/device:XLA_", test_device, ":0"); FusedBatchNorm fused_batch_norm_tf( root.WithOpName("fused_batch_norm_tf").WithDevice(tf_device), input, diff --git 
a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc index f23c423fbb2632..5f794005b7c7c0 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler.cc @@ -292,12 +292,12 @@ absl::Status GraphCompiler::CompileFunctionalNode(Node* n, } } if (add_token_input_output) { - std::vector token_input_nodes; + std::vector token_input_nodes; TF_RETURN_IF_ERROR(GetNodeAttr(AttrSlice(&func.attr()), kXlaTokenInputNodesAttrName, &token_input_nodes)); std::vector token_inputs; - for (const string& node_name : token_input_nodes) { + for (const std::string& node_name : token_input_nodes) { auto token_or = compiler->GetNodeToken(node_name); TF_RETURN_IF_ERROR(token_or.status()); token_inputs.push_back(std::move(token_or).value()); diff --git a/tensorflow/compiler/tf2xla/graph_compiler_test.cc b/tensorflow/compiler/tf2xla/graph_compiler_test.cc index 3010ac7f0b026b..2dcb2ea0b52d45 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler_test.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler_test.cc @@ -104,8 +104,8 @@ class GraphCompilerTest : public ::testing::Test { core::ScopedUnref context_unref(xla_context); xla_context->Ref(); - auto step_container = - std::make_unique(0, [this](const string& name) { + auto step_container = std::make_unique( + 0, [this](const std::string& name) { absl::Status status = this->device_->resource_manager()->Cleanup(name); }); diff --git a/tensorflow/compiler/tf2xla/graph_compiler_util.cc b/tensorflow/compiler/tf2xla/graph_compiler_util.cc index d1c984e26f390a..116c1e68f66fe6 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler_util.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler_util.cc @@ -44,7 +44,7 @@ const char* const kFetchIdAttr = "_fetch_id"; const char* const kShapeAttr = "_shape"; const char* const kDebugNameAttr = "_debug_name"; -typedef std::unordered_map NodeMap; +typedef std::unordered_map NodeMap; // Each feed id identifies the positional output 
of some node, which may consist // of multiple edges. AddPlaceholdersForFeeds has already replaced each fed @@ -54,14 +54,14 @@ typedef std::unordered_map NodeMap; absl::Status AddArgNodes( Graph* graph, const NodeMap& node_map, const protobuf::RepeatedPtrField& feeds, - const std::unordered_map& feed_remapping, + const std::unordered_map& feed_remapping, std::unordered_set* arg_nodes) { for (int arg_index = 0; arg_index < feeds.size(); ++arg_index) { const tf2xla::Feed& feed = feeds[arg_index]; // All feeds have been replaced by placeholders. const int output_index = 0; - const string key = TensorIdToString(feed.id()); + const std::string key = TensorIdToString(feed.id()); const auto remap_it = feed_remapping.find(key); auto node_it = node_map.find(remap_it->second); if (node_it == node_map.end()) { @@ -149,7 +149,7 @@ absl::Status AddRetvalNodes( // execution to know the input and output args for the generated function. absl::Status RewriteAndPruneGraph( Graph* graph, const tf2xla::Config& config, - const std::unordered_map& feed_remapping) { + const std::unordered_map& feed_remapping) { NodeMap node_map; for (Node* n : graph->nodes()) { node_map[n->name()] = n; @@ -164,7 +164,7 @@ absl::Status RewriteAndPruneGraph( FixupSourceAndSinkEdges(graph); VLOG(2) << "Post prune: " << DumpGraphToFile("tfcompile_post_prune", *graph); // Sanity-check, to make sure the feeds and fetches still exist post-pruning. 
- std::set missing_feeds, missing_fetches; + std::set missing_feeds, missing_fetches; for (const tf2xla::Feed& feed : config.feed()) { missing_feeds.insert(TensorIdToString(feed.id())); } @@ -173,14 +173,14 @@ absl::Status RewriteAndPruneGraph( } for (const Node* n : graph->op_nodes()) { if (n->type_string() == FunctionLibraryDefinition::kArgOp) { - string feed_id; + std::string feed_id; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), kFeedIdAttr, &feed_id)); if (missing_feeds.erase(feed_id) == 0) { return errors::Aborted(FunctionLibraryDefinition::kArgOp, " node found with unknown feed id: ", feed_id); } } else if (n->type_string() == FunctionLibraryDefinition::kRetOp) { - string fetch_id; + std::string fetch_id; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), kFetchIdAttr, &fetch_id)); if (missing_fetches.erase(fetch_id) == 0) { return errors::Aborted(FunctionLibraryDefinition::kRetOp, @@ -277,7 +277,7 @@ absl::Status InitGraph(const GraphDef& graph_def, const tf2xla::Config& config, GraphDef first_copy_def = graph_def; // Maps from name:port of a feed to the name:port of the placeholder to use. 
- std::unordered_map feed_remapping; + std::unordered_map feed_remapping; TF_RETURN_IF_ERROR(AddPlaceholdersForFeeds(config, g->op_registry(), &feed_remapping, &first_copy_def)); diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index 5079ddd4389bd8..bb50d530484b10 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -371,7 +371,6 @@ cc_library( "@local_xla//xla/hlo/translate:stablehlo", "@local_xla//xla/mlir/utils:error_util", "@local_xla//xla/mlir/utils:type_util", - "@local_xla//xla/mlir_hlo:mhlo_passes", "@local_xla//xla/python:refine_polymorphic_shapes", "@local_xla//xla/service:hlo_proto_cc", "@local_xla//xla/service/spmd/shardy/sdy_round_trip:pipelines", @@ -382,6 +381,7 @@ cc_library( "@stablehlo//:chlo_ops", "@stablehlo//:stablehlo_ops", "@stablehlo//:stablehlo_passes", + "@stablehlo//:stablehlo_passes_optimization", "@stablehlo//:stablehlo_serialization", "@stablehlo//:vhlo_ops", ], diff --git a/tensorflow/compiler/tf2xla/kernels/all_reduce_op.cc b/tensorflow/compiler/tf2xla/kernels/all_reduce_op.cc index a6ddbfd3a01fef..74c888d37de784 100644 --- a/tensorflow/compiler/tf2xla/kernels/all_reduce_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/all_reduce_op.cc @@ -94,9 +94,9 @@ class CollectiveReduceV2Op : public XlaOpKernel { private: DataType dtype_ = DT_INVALID; - string merge_op_name_; - string final_op_name_; - string communication_hint_; + std::string merge_op_name_; + std::string final_op_name_; + std::string communication_hint_; CollectiveReduceV2Op(const CollectiveReduceV2Op&) = delete; void operator=(const CollectiveReduceV2Op&) = delete; diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc index 0dd528e3dea173..240a099f075aa2 100644 --- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc @@ -48,7 +48,7 @@ class FusedBatchNormOp 
: public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->GetAttr("is_training", &is_training_)); OP_REQUIRES_OK( ctx, ctx->GetAttr("exponential_avg_factor", &exponential_avg_factor_)); - string data_format_str; + std::string data_format_str; OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str)); OP_REQUIRES( ctx, FormatFromString(data_format_str, &data_format_), @@ -61,7 +61,7 @@ class FusedBatchNormOp : public XlaOpKernel { errors::InvalidArgument( "FusedBatchNormEx supports at most 1 side input.")); add_side_input_ = (num_side_inputs == 1); - string activation_mode; + std::string activation_mode; OP_REQUIRES_OK(ctx, ctx->GetAttr("activation_mode", &activation_mode)); OP_REQUIRES(ctx, activation_mode == "Identity" || activation_mode == "Relu", @@ -249,7 +249,7 @@ class FusedBatchNormGradOp : public XlaOpKernel { explicit FusedBatchNormGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("epsilon", &epsilon_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("is_training", &is_training_)); - string data_format_str; + std::string data_format_str; OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str)); OP_REQUIRES( ctx, FormatFromString(data_format_str, &data_format_), diff --git a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc index 7c89720292b0a7..94486a104152ea 100644 --- a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc @@ -66,9 +66,11 @@ class BCastArgsOp : public XlaOpKernel { Tensor output(val_type, TensorShape({len})); for (int64_t i = 0; i < len; ++i) { if (val_type == DT_INT32) { - output.flat()(i) = static_cast(bcast.output_shape()[i]); + output.flat()(i) = + static_cast(bcast.output_shape()[i]); } else { - output.flat()(i) = static_cast(bcast.output_shape()[i]); + output.flat()(i) = + static_cast(bcast.output_shape()[i]); } } ctx->SetConstantOutput(0, output); @@ -129,9 +131,9 @@ class BCastGradArgsOp : public 
XlaOpKernel { Tensor constant(val_type, TensorShape({len})); for (int64_t i = 0; i < len; ++i) { if (val_type == DT_INT32) { - constant.flat()(i) = static_cast(v[i]); + constant.flat()(i) = static_cast(v[i]); } else { - constant.flat()(i) = static_cast(v[i]); + constant.flat()(i) = static_cast(v[i]); } } ctx->SetConstantOutput(idx, constant); diff --git a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc index 2bf4ab52c8b59e..bf428711664d76 100644 --- a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc @@ -28,7 +28,7 @@ namespace { class BiasOp : public XlaOpKernel { public: explicit BiasOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { - string data_format; + std::string data_format; if (ctx->GetAttr("data_format", &data_format).ok()) { OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); diff --git a/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc b/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc index 510d5225d6f04b..7d323b16d8856e 100644 --- a/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc @@ -55,7 +55,7 @@ class BucketizeOp : public XlaOpKernel { /*broadcast_dimensions=*/{0}), xla::S32); xla::XlaOp buckets = xla::Reduce( - comparison, /*init_value=*/xla::ConstantR0(builder, 0), + comparison, /*init_value=*/xla::ConstantR0(builder, 0), /*computation=*/xla::CreateScalarAddComputation(xla::S32, builder), /*dimensions_to_reduce=*/{0}); context->SetOutput(0, buckets); diff --git a/tensorflow/compiler/tf2xla/kernels/case_op.cc b/tensorflow/compiler/tf2xla/kernels/case_op.cc index cead6d10c2a0eb..da40d84e73f063 100644 --- a/tensorflow/compiler/tf2xla/kernels/case_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/case_op.cc @@ -66,7 +66,7 @@ XlaCaseOp::GetPrunedBranchesAndIndex(XlaOpKernelContext* ctx) { return {unpruned_branches_, ctx->Input(0)}; } - 
int32_t branch_index = branch_index_literal.Get({}); + int32_t branch_index = branch_index_literal.Get({}); if (branch_index < 0 || branch_index >= unpruned_branches_.size()) { branch_index = unpruned_branches_.size() - 1; } @@ -187,7 +187,8 @@ void XlaCaseOp::Compile(XlaOpKernelContext* ctx) { // Add any TensorArray gradients touched by the then/else computation to // the enclosing graph. - for (const string& grad_source : update.tensor_array_gradients_accessed) { + for (const std::string& grad_source : + update.tensor_array_gradients_accessed) { VLOG(5) << "TensorArray " << resource->name() << " accessed gradient " << grad_source; XlaResource* gradient; @@ -289,7 +290,7 @@ void XlaCaseOp::Compile(XlaOpKernelContext* ctx) { // Set token input for this "case" op. std::vector token_inputs; token_inputs.reserve(token_input_nodes_.size()); - for (const string& node_name : token_input_nodes_) { + for (const std::string& node_name : token_input_nodes_) { auto token_or = compiler->GetNodeToken(node_name); OP_REQUIRES_OK(ctx, token_or.status()); token_inputs.push_back(token_or.value()); diff --git a/tensorflow/compiler/tf2xla/kernels/case_op.h b/tensorflow/compiler/tf2xla/kernels/case_op.h index a4c01bea65a04d..6574fb4aac4c5e 100644 --- a/tensorflow/compiler/tf2xla/kernels/case_op.h +++ b/tensorflow/compiler/tf2xla/kernels/case_op.h @@ -65,8 +65,8 @@ class XlaCaseOp : public XlaOpKernel { DataTypeVector input_types_; DataTypeVector output_types_; bool has_token_input_output_; - std::vector token_input_nodes_; - string original_node_name_; + std::vector token_input_nodes_; + std::string original_node_name_; // Whether to propagate compile time consts into the cond branches. // This is not supported by default now since it may cause HBM memory // overheads. 
diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc index e8c804791299a7..2c69974d8373dc 100644 --- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc @@ -185,7 +185,7 @@ class StatelessCategoricalOp : public CategoricalOp { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessCategoricalOp(const StatelessCategoricalOp&) = delete; void operator=(const StatelessCategoricalOp&) = delete; diff --git a/tensorflow/compiler/tf2xla/kernels/const_op.cc b/tensorflow/compiler/tf2xla/kernels/const_op.cc index d2463a9974b1bb..7ab53f7ad89e75 100644 --- a/tensorflow/compiler/tf2xla/kernels/const_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/const_op.cc @@ -38,7 +38,7 @@ template ::value>::type* = nullptr> DstT CastTo(int32_t src) { - return absl::bit_cast(static_cast(src)); + return absl::bit_cast(static_cast(src)); } // Returns scalar constant with the value in the tensor, if the given proto has diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc index 3fe22dcb4441e7..59f72e630c0f75 100644 --- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc +++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc @@ -163,8 +163,8 @@ absl::Status CheckConvAttrs(const ConvOpAttrs& attrs) { absl::Status ConvBackpropComputeDimensionsV2XlaShapes( absl::string_view label, int num_spatial_dims, const xla::Shape& input_shape, const xla::Shape& filter_shape, - const xla::Shape& out_backprop_shape, absl::Span dilations, - const std::vector& strides, Padding padding, + const xla::Shape& out_backprop_shape, absl::Span dilations, + const std::vector& strides, Padding padding, TensorFormat data_format, ConvBackpropDimensions* dims, absl::Span explicit_paddings) { TensorShape input_tensor_shape, filter_tensor_shape, @@ -203,7 +203,7 @@ absl::StatusOr 
ConvOpAttrs::Create(int num_spatial_dims, ctx->GetAttr("explicit_paddings", &attrs.explicit_paddings)); } - string data_format; + std::string data_format; TF_RETURN_IF_ERROR(ctx->GetAttr("data_format", &data_format)); if (!FormatFromString(data_format, &attrs.data_format)) { return errors::InvalidArgument("Invalid data format: ", data_format); @@ -231,7 +231,7 @@ absl::StatusOr ConvNDOpAttrs::Create(OpKernelConstruction* ctx) { ctx->GetAttr("explicit_paddings", &attrs.explicit_paddings)); } - string data_format_str; + std::string data_format_str; TF_RETURN_IF_ERROR(ctx->GetAttr("data_format", &data_format_str)); if (!(data_format_str == "CHANNELS_LAST" || data_format_str == "CHANNELS_FIRST")) { diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h index 94e454df205df2..e64cebe3970cd8 100644 --- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h +++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h @@ -54,8 +54,8 @@ struct ConvOpAttrs { bool depthwise; int num_spatial_dims; - std::vector dilations; - std::vector strides; + std::vector dilations; + std::vector strides; Padding padding; std::vector explicit_paddings; TensorFormat data_format; @@ -68,8 +68,8 @@ struct ConvNDOpAttrs { int groups; int batch_dims; - std::vector dilations; - std::vector strides; + std::vector dilations; + std::vector strides; Padding padding; std::vector explicit_paddings; TensorFormat data_format; diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc index b1da0acd61608f..82fdf8ea577e39 100644 --- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc @@ -92,9 +92,9 @@ class ConvNDOp : public XlaOpKernel { ConvOpAttrs forward_attrs; forward_attrs.depthwise = false; forward_attrs.num_spatial_dims = num_spatial_dims; - forward_attrs.dilations = attrs_.dilations.empty() - ? 
std::vector(num_spatial_dims + 2, 1) - : attrs_.dilations; + forward_attrs.dilations = + attrs_.dilations.empty() ? std::vector(num_spatial_dims + 2, 1) + : attrs_.dilations; forward_attrs.strides = attrs_.strides; forward_attrs.padding = attrs_.padding; forward_attrs.explicit_paddings = attrs_.explicit_paddings; diff --git a/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc b/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc index 226d6248bd00d8..27818415169dbe 100644 --- a/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc @@ -36,9 +36,9 @@ class DataFormatDimMapOp : public XlaOpKernel { public: explicit DataFormatDimMapOp(OpKernelConstruction* context) : XlaOpKernel(context) { - string src_format; + std::string src_format; OP_REQUIRES_OK(context, context->GetAttr("src_format", &src_format)); - string dst_format; + std::string dst_format; OP_REQUIRES_OK(context, context->GetAttr("dst_format", &dst_format)); OP_REQUIRES(context, src_format.size() == 4 || src_format.size() == 5, errors::InvalidArgument( @@ -69,9 +69,9 @@ class DataFormatDimMapOp : public XlaOpKernel { void Compile(XlaOpKernelContext* context) override { auto builder = context->builder(); xla::XlaOp dst_indices = - xla::ConstantR1(builder, absl::Span(dst_idx_)); + xla::ConstantR1(builder, absl::Span(dst_idx_)); const int dims = dst_idx_.size(); - xla::XlaOp rank = xla::ConstantR0(builder, dims); + xla::XlaOp rank = xla::ConstantR0(builder, dims); xla::XlaOp src_indices = (xla::ConvertElementType(context->Input(0), xla::S32) + rank) % rank; xla::XlaOp output = @@ -81,7 +81,7 @@ class DataFormatDimMapOp : public XlaOpKernel { } private: - std::vector dst_idx_; + std::vector dst_idx_; DataFormatDimMapOp(const DataFormatDimMapOp&) = delete; void operator=(const DataFormatDimMapOp&) = delete; @@ -146,13 +146,13 @@ class DataFormatVecPermuteOp : public XlaOpKernel { input_tensor_shape.DebugString())); } - string src_format_str = 
src_format_; - string dst_format_str = dst_format_; + std::string src_format_str = src_format_; + std::string dst_format_str = dst_format_; if (input_tensor_shape.dim_size(0) == spatial_dim_count) { // If the input is a vector of size spatial_dim_count, treat the elements // as spatial dimensions. auto keep_only_spatial_dimensions = - [spatial_dim_count](string* format_str) -> void { + [spatial_dim_count](std::string* format_str) -> void { auto new_end = std::remove_if(format_str->begin(), format_str->end(), [spatial_dim_count](const char dim) { @@ -164,7 +164,7 @@ class DataFormatVecPermuteOp : public XlaOpKernel { keep_only_spatial_dimensions(&src_format_str); keep_only_spatial_dimensions(&dst_format_str); } - std::vector dst_indices(dim0); + std::vector dst_indices(dim0); for (int i = 0; i < dim0; ++i) { for (int j = 0; j < dim0; ++j) { if (src_format_str[i] == dst_format_str[j]) { @@ -174,14 +174,14 @@ class DataFormatVecPermuteOp : public XlaOpKernel { } } xla::XlaOp indices = - xla::ConstantR1(builder, absl::Span(dst_indices)); + xla::ConstantR1(builder, absl::Span(dst_indices)); xla::XlaOp output = xla::TorchIndexSelect(ctx->Input(0), indices, 0); ctx->SetOutput(0, output); } private: - string src_format_; - string dst_format_; + std::string src_format_; + std::string dst_format_; DataFormatVecPermuteOp(const DataFormatVecPermuteOp&) = delete; void operator=(const DataFormatVecPermuteOp&) = delete; diff --git a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc index e8e2babffd529c..7e93ed9c32e126 100644 --- a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc @@ -31,7 +31,7 @@ namespace { class DepthToSpaceOp : public XlaOpKernel { public: explicit DepthToSpaceOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { - string data_format_str; + std::string data_format_str; OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str)); 
OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_), errors::InvalidArgument("Invalid data format")); diff --git a/tensorflow/compiler/tf2xla/kernels/dequantize_op.cc b/tensorflow/compiler/tf2xla/kernels/dequantize_op.cc index d383c7d0ab4aa3..bc03e14556f9cb 100644 --- a/tensorflow/compiler/tf2xla/kernels/dequantize_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/dequantize_op.cc @@ -42,7 +42,7 @@ float get_fullrange() { class DequantizeOp : public XlaOpKernel { public: explicit DequantizeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { - string mode_string; + std::string mode_string; int axis; bool narrow_range; diff --git a/tensorflow/compiler/tf2xla/kernels/device_index_op.cc b/tensorflow/compiler/tf2xla/kernels/device_index_op.cc index 141415bcd0d8c0..a5665baa6e3dc5 100644 --- a/tensorflow/compiler/tf2xla/kernels/device_index_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/device_index_op.cc @@ -39,11 +39,11 @@ class DeviceIndexOp : public XlaOpKernel { // When compiling we are not executing on any physical device, so we return // a sentinel value (size of the list of devices). 
ctx->SetOutput( - 0, xla::ConstantR0(ctx->builder(), device_names_.size())); + 0, xla::ConstantR0(ctx->builder(), device_names_.size())); } private: - std::vector device_names_; + std::vector device_names_; }; REGISTER_XLA_OP(Name("DeviceIndex"), DeviceIndexOp); diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_partition_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_partition_op.cc index ceeea010ee7858..ae7488ad1e1cbd 100644 --- a/tensorflow/compiler/tf2xla/kernels/dynamic_partition_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/dynamic_partition_op.cc @@ -54,8 +54,8 @@ class DynamicPartitionOp : public XlaOpKernel { xla::XlaOp CountS32(XlaOpKernelContext* ctx, xla::XlaOp input, int64_t target) { xla::XlaOp equal_dim = - xla::Compare(input, xla::ConstantR0(ctx->builder(), target), {}, - xla::ComparisonDirection::kEq); + xla::Compare(input, xla::ConstantR0(ctx->builder(), target), + {}, xla::ComparisonDirection::kEq); xla::XlaOp casted = xla::ConvertElementType(equal_dim, xla::S32); return xla::ReduceAll( casted, xla::Zero(ctx->builder(), xla::S32), @@ -178,8 +178,9 @@ class DynamicPartitionOp : public XlaOpKernel { } else { xla::XlaOp length; if (count_diff != 0) { - length = xla::Div(partition_length[i], - xla::ConstantR0(ctx->builder(), count_diff)); + length = + xla::Div(partition_length[i], + xla::ConstantR0(ctx->builder(), count_diff)); } else { length = CountS32(ctx, ctx->Input(1), /*target=*/i); } diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc index cb7e4f6f96437e..edf9afb5ae14fb 100644 --- a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc @@ -145,8 +145,8 @@ class DynamicStitchOp : public XlaOpKernel { // Construct the reverse mapping, for each index, of which slice of which // input it comes from. 
- std::vector src_input_vector(number_of_indices); - std::vector src_slice_vector(number_of_indices); + std::vector src_input_vector(number_of_indices); + std::vector src_slice_vector(number_of_indices); std::vector src_index_used(number_of_indices); int index_used_count = 0; for (int input_num = 0; input_num < indices.size(); input_num++) { diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc index 4a1de78d9371b3..b9ca65cfbd6371 100644 --- a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc @@ -179,9 +179,9 @@ class ExtractImagePatchesOp : public XlaOpKernel { } protected: - std::vector ksizes_; - std::vector dilations_; - std::vector strides_; + std::vector ksizes_; + std::vector dilations_; + std::vector strides_; Padding padding_; private: diff --git a/tensorflow/compiler/tf2xla/kernels/fused_conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/fused_conv_ops.cc index b2b1eb3343e698..8075982c766a97 100644 --- a/tensorflow/compiler/tf2xla/kernels/fused_conv_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/fused_conv_ops.cc @@ -154,7 +154,7 @@ class FusedConv2DInt8Op : public XlaOpKernel { // Un-vectorize NCHW_VECT_C to NCHW. TensorFormat orig_data_format = conv_attrs_.data_format; - int64 vect_width = -1; + int64_t vect_width = -1; switch (conv_attrs_.data_format) { case FORMAT_NCHW_VECT_C: vect_width = conv_input_shape.dimensions(4); diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc index 2783951e1b6b0f..e94f74d1fed8ef 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc @@ -275,7 +275,7 @@ class GatherOp : public XlaOpKernel { // The number of batch dimensions, as passed in the batch_dims attribute. // It must be less than or equal to rank(indices). 
- int32 batch_dims_ = 0; + int32_t batch_dims_ = 0; }; REGISTER_XLA_OP(Name("Gather"), MlirXlaOpKernel); diff --git a/tensorflow/compiler/tf2xla/kernels/gather_scatter_ops.cc b/tensorflow/compiler/tf2xla/kernels/gather_scatter_ops.cc index 033144e9f308e4..2aec21a6db5888 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_scatter_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/gather_scatter_ops.cc @@ -28,7 +28,7 @@ namespace { class GatherOp : public XlaOpKernel { public: explicit GatherOp(OpKernelConstruction* context) : XlaOpKernel(context) { - string dnums_attr; + std::string dnums_attr; OP_REQUIRES_OK(context, context->GetAttr("dimension_numbers", &dnums_attr)); OP_REQUIRES( context, dnums_.ParsePartialFromString(dnums_attr), @@ -60,7 +60,7 @@ class ScatterOp : public XlaOpKernel { explicit ScatterOp(OpKernelConstruction* context) : XlaOpKernel(context) { OP_REQUIRES_OK( context, context->GetAttr("update_computation", &update_computation_)); - string dnums_attr; + std::string dnums_attr; OP_REQUIRES_OK(context, context->GetAttr("dimension_numbers", &dnums_attr)); OP_REQUIRES( context, dnums_.ParsePartialFromString(dnums_attr), diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc index 17db09722ba954..56c86d3d597227 100644 --- a/tensorflow/compiler/tf2xla/kernels/if_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc @@ -84,7 +84,8 @@ static absl::StatusOr PopulateTensorArrayGradients( // Add any TensorArray gradients touched by the then/else computation to // the enclosing graph. - for (const string& grad_source : update.tensor_array_gradients_accessed) { + for (const std::string& grad_source : + update.tensor_array_gradients_accessed) { VLOG(5) << "TensorArray " << resource->name() << " accessed gradient " << grad_source; XlaResource* gradient; @@ -318,7 +319,7 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { if (has_token_input_output_ && i == num_inputs - 1) { // Set token input for this "if" op. 
std::vector token_inputs; - for (const string& node_name : token_input_nodes_) { + for (const std::string& node_name : token_input_nodes_) { auto token_or = compiler->GetNodeToken(node_name); OP_REQUIRES_OK(ctx, token_or.status()); token_inputs.push_back(token_or.value()); diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.h b/tensorflow/compiler/tf2xla/kernels/if_op.h index fc6dd2e08bf41f..c11cfcb08e0b09 100644 --- a/tensorflow/compiler/tf2xla/kernels/if_op.h +++ b/tensorflow/compiler/tf2xla/kernels/if_op.h @@ -61,8 +61,8 @@ class XlaIfOp : public XlaOpKernel { DataTypeVector output_types_; std::vector output_shapes_; bool has_token_input_output_; - std::vector token_input_nodes_; - string original_node_name_; + std::vector token_input_nodes_; + std::string original_node_name_; }; } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/image_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_ops.cc index a8eb7bbf794268..a2676e095b91b7 100644 --- a/tensorflow/compiler/tf2xla/kernels/image_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/image_ops.cc @@ -352,10 +352,11 @@ struct WhileCondFn { xla::XlaBuilder* cond_builder) const { xla::XlaOp row_idx = values[0]; xla::XlaOp row_in_bounds = - xla::Lt(row_idx, xla::ConstantR0(cond_builder, num_boxes)); + xla::Lt(row_idx, xla::ConstantR0(cond_builder, num_boxes)); xla::XlaOp num_outputs_so_far = values[1]; - xla::XlaOp results_not_full = xla::Lt( - num_outputs_so_far, xla::ConstantR0(cond_builder, output_size)); + xla::XlaOp results_not_full = + xla::Lt(num_outputs_so_far, + xla::ConstantR0(cond_builder, output_size)); return xla::And(row_in_bounds, results_not_full); } }; @@ -375,7 +376,7 @@ struct SuppressBodyFn { auto num_outputs_so_far = values[1]; auto iou_mask = values[2]; auto included_iou = values[3]; - auto zero = xla::ConstantR0(builder, 0); + auto zero = xla::ConstantR0(builder, 0); // Determine if current elem is active using a slice. 
// TODO(b/118437727): The only reason we need an explicit vector is because // some old GCCs can't deduce the right type for MakeConstSpan, and @@ -386,7 +387,7 @@ struct SuppressBodyFn { active_elem = xla::Reshape(active_elem, {}); // Increment output count iff current elem is not suppressed. num_outputs_so_far = xla::Select( - active_elem, num_outputs_so_far + xla::ConstantR0(builder, 1), + active_elem, num_outputs_so_far + xla::ConstantR0(builder, 1), num_outputs_so_far); // Slice out the row_idx. auto row_iou = xla::DynamicSlice(iou_mask, {row_idx, zero}, {1, num_boxes}); @@ -412,7 +413,7 @@ struct SuppressBodyFn { } included_iou = xla::Select(cond, xla::And(included_iou, supp_mask), included_iou); - row_idx = row_idx + xla::ConstantR0(builder, 1); + row_idx = row_idx + xla::ConstantR0(builder, 1); return std::vector{row_idx, num_outputs_so_far, iou_mask, included_iou}; } @@ -456,7 +457,7 @@ class NonMaxSuppressionOp : public XlaOpKernel { errors::InvalidArgument( "scores size ", std::to_string(scores_shape.dim_size(0)), " must equal number of boxes ", std::to_string(num_boxes))); - OP_REQUIRES(context, num_boxes <= kint32max, + OP_REQUIRES(context, num_boxes <= std::numeric_limits::max(), errors::InvalidArgument("XLA compilation requires number of " "boxes to be <= kint32max, got ", num_boxes)); @@ -477,7 +478,7 @@ class NonMaxSuppressionOp : public XlaOpKernel { OP_REQUIRES( context, output_size >= 0, errors::InvalidArgument("Need output_size >= 0, got ", output_size)); - OP_REQUIRES(context, output_size <= kint32max, + OP_REQUIRES(context, output_size <= std::numeric_limits::max(), errors::InvalidArgument("Need output_size <= kint32Max, got ", output_size)); const xla::XlaOp score_thresh = context->Input("score_threshold"); @@ -564,8 +565,8 @@ class NonMaxSuppressionOp : public XlaOpKernel { std::vector init_values; init_values.reserve(4); - init_values.push_back(xla::ConstantR0(builder, 0)); // col_idx - init_values.push_back(xla::ConstantR0(builder, 0)); 
// num_outputs + init_values.push_back(xla::ConstantR0(builder, 0)); // col_idx + init_values.push_back(xla::ConstantR0(builder, 0)); // num_outputs init_values.push_back(iou_thresh_mask); init_values.push_back(included_iou); @@ -595,8 +596,8 @@ class NonMaxSuppressionOp : public XlaOpKernel { // can be suppressed by score threshold. xla::XlaOp ones_included = xla::Select( included, - xla::Broadcast(xla::ConstantR0(builder, 1), {num_boxes}), - xla::Broadcast(xla::ConstantR0(builder, 0), {num_boxes})); + xla::Broadcast(xla::ConstantR0(builder, 1), {num_boxes}), + xla::Broadcast(xla::ConstantR0(builder, 0), {num_boxes})); // num_valid is scalar. Value should be bound by output_size. xla::XlaOp num_valid_total = xla::Reduce( @@ -604,8 +605,8 @@ class NonMaxSuppressionOp : public XlaOpKernel { /*init_value=*/xla::ConstantR0(builder, 0), /*computation=*/CreateScalarAddComputation(xla::S32, builder), /*dimensions_to_reduce=*/{0}); - xla::XlaOp num_valid = - xla::Min(num_valid_total, xla::ConstantR0(builder, output_size)); + xla::XlaOp num_valid = xla::Min( + num_valid_total, xla::ConstantR0(builder, output_size)); // Re-index into the original scores input tensor, using a Gather. // Boxes were suppressed in the sorted domain. diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc index 58811c10744131..9959f8d4e44be6 100644 --- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc @@ -120,8 +120,8 @@ ResizeConvolutionDims ComputeResizeConvolutionParameters( const int64_t out_size_factor = align_corners ? 
out_size[i] - 1 : out_size[i]; - int64_t gcd = MathUtil::GCD(static_cast(in_size_factor), - static_cast(out_size_factor)); + int64_t gcd = MathUtil::GCD(static_cast(in_size_factor), + static_cast(out_size_factor)); dims.stride[i] = in_size_factor / gcd; dims.kernel_size[i] = out_size_factor / gcd; } diff --git a/tensorflow/compiler/tf2xla/kernels/in_topk_op.cc b/tensorflow/compiler/tf2xla/kernels/in_topk_op.cc index f357262a39c35b..5b730cc0a9076d 100644 --- a/tensorflow/compiler/tf2xla/kernels/in_topk_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/in_topk_op.cc @@ -96,7 +96,7 @@ class InTopKOp : public XlaOpKernel { xla::CreateScalarAddComputation(xla::S32, xla_builder), {1}); xla::XlaOp result = - xla::And(xla::Lt(num_gt_r1, xla::ConstantR0(xla_builder, k)), + xla::And(xla::Lt(num_gt_r1, xla::ConstantR0(xla_builder, k)), xla::IsFinite(targets_values_r1)); context->SetOutput(0, result); diff --git a/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc b/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc index 85d70705c83837..390bc09c33057d 100644 --- a/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc +++ b/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc @@ -464,7 +464,7 @@ class TfCallbackDevice : public DeviceBase { set_tensorflow_accelerator_device_info(&accelerator_device_info_); } - const string& name() const override { return name_; } + const std::string& name() const override { return name_; } PerOpGpuDevice* MakeGpuDevice() override { #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc index dfe8a36005b837..aabbd8d3b0514e 100644 --- a/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc @@ -60,7 +60,7 @@ class ListDiffOp : public XlaOpKernel { absl::Status status; switch (val_type) { case DT_INT32: - status = ListDiffWithIndexType(context, idx_type); + 
status = ListDiffWithIndexType(context, idx_type); break; case DT_INT64: status = ListDiffWithIndexType(context, idx_type); @@ -111,7 +111,7 @@ class ListDiffOp : public XlaOpKernel { DataType idx_type) { switch (idx_type) { case DT_INT32: - return ListDiff(context); + return ListDiff(context); case DT_INT64: return ListDiff(context); default: diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc b/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc index 48e8f976cc67bb..8e7c966bdf35fc 100644 --- a/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc @@ -57,7 +57,7 @@ static inline bool IsLeftAligned(int diag_index, bool left_align_superdiagonal, void ReadAlignment(OpKernelConstruction* context, bool* left_align_superdiagonal, bool* left_align_subdiagonal) { - string align; + std::string align; OP_REQUIRES_OK(context, context->GetAttr("align", &align)); *left_align_superdiagonal = align == "LEFT_LEFT" || align == "LEFT_RIGHT"; diff --git a/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc b/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc index 82dbfb3839312c..215de2bc5067e4 100644 --- a/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc @@ -78,7 +78,7 @@ class OneHotOp : public XlaOpKernel { } private: - int32 axis_; + int32_t axis_; OneHotOp(const OneHotOp&) = delete; void operator=(const OneHotOp&) = delete; diff --git a/tensorflow/compiler/tf2xla/kernels/pad_op.cc b/tensorflow/compiler/tf2xla/kernels/pad_op.cc index 1758451faf469f..15b2b5f9d2ebbb 100644 --- a/tensorflow/compiler/tf2xla/kernels/pad_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/pad_op.cc @@ -113,7 +113,7 @@ class PadOp : public XlaOpKernel { high_pad_size = xla::Reshape(high_pad_size, {}); high_pad_size = xla::ConvertElementType(high_pad_size, xla::S32); // Low pad has to be static. 
- xla::XlaOp low_pad_size = xla::ConstantR0( + xla::XlaOp low_pad_size = xla::ConstantR0( ctx->builder(), pad_literal.Get({i, 0})); xla::XlaOp input_size = xla::GetDimensionSize(input, i); xla::XlaOp total_size = low_pad_size + input_size + high_pad_size; @@ -122,7 +122,7 @@ class PadOp : public XlaOpKernel { total_size, xla::ValueInferenceMode::kUpperBound); OP_REQUIRES_OK(ctx, size_upper_bound_status_or.status()); auto size_upper_bound = - size_upper_bound_status_or.value().Get({}); + size_upper_bound_status_or.value().Get({}); OP_REQUIRES( ctx, size_upper_bound.has_value(), errors::InvalidArgument( diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc index aa7c78b8b8f97a..77db609d997614 100644 --- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc @@ -88,8 +88,8 @@ class PoolingOp : public XlaOpKernel { num_spatial_dims_(num_spatial_dims), reduction_type_(reduction_type) { if (ctx->num_inputs() == 1) { - std::vector ksize_int; - std::vector stride_int; + std::vector ksize_int; + std::vector stride_int; OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_int)); OP_REQUIRES(ctx, ksize_int.size() == num_dims(), errors::InvalidArgument("Sliding window ksize field must " @@ -255,15 +255,15 @@ class MaxPoolOp : public PoolingOp { ctx->builder()->GetShape(pooling); OP_REQUIRES_OK(ctx, result_shape.status()); - int64 num_channels = result_shape->dimensions(1); + int64_t num_channels = result_shape->dimensions(1); OP_REQUIRES( ctx, num_channels % *vect_width == 0, errors::FailedPrecondition("Result of NCHW_VECT_C op must have " "channels multiple of ", *vect_width, ", but was ", num_channels)); - absl::InlinedVector new_dims(result_shape->dimensions().begin(), - result_shape->dimensions().end()); + absl::InlinedVector new_dims( + result_shape->dimensions().begin(), result_shape->dimensions().end()); new_dims[1] /= *vect_width; 
new_dims.insert(new_dims.begin() + 2, *vect_width); pooling = @@ -298,7 +298,7 @@ class AvgPoolOp : public PoolingOp { : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims, /*reduction_type=*/ XlaHelpers::SumAccumulationType(ctx->input_type(0))) { - string data_format_str; + std::string data_format_str; OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str)); OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -466,7 +466,7 @@ class MaxPool2DGradOp : public MaxPoolGradOp { public: explicit MaxPool2DGradOp(OpKernelConstruction* ctx) : MaxPoolGradOp(ctx, /*num_spatial_dims=*/2) { - string data_format; + std::string data_format; OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format)); OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -505,7 +505,7 @@ class AvgPoolGradOp : public XlaOpKernel { errors::Unimplemented( "Pooling is not yet supported on the batch dimension.")); - string data_format; + std::string data_format; OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format)); OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -561,7 +561,7 @@ class AvgPoolGradOp : public XlaOpKernel { protected: const int num_spatial_dims_; std::vector ksize_; - std::vector stride_; + std::vector stride_; Padding padding_; TensorFormat data_format_ = FORMAT_NHWC; }; @@ -677,7 +677,7 @@ class MaxPoolGradGradOp : public XlaOpKernel { auto b = ctx->builder(); - auto sixteen = xla::ConstantR0(b, 16); + auto sixteen = xla::ConstantR0(b, 16); // in (f32) -> round to 7 mantissa bits (bf16)-> 16-high-bit u32. 
// // NOTE: Use a ReducePrecision operation instead of a cast to BF16 and back @@ -702,7 +702,7 @@ class MaxPoolGradGradOp : public XlaOpKernel { const xla::Shape scalar = xla::ShapeUtil::MakeShape(xla::F32, {}); auto lhs = xla::Parameter(rb.get(), 0, scalar, "lhs"); auto rhs = xla::Parameter(rb.get(), 1, scalar, "rhs"); - auto sixteen = xla::ConstantR0(rb.get(), 16); + auto sixteen = xla::ConstantR0(rb.get(), 16); auto lhs_criteria = xla::ShiftLeft(xla::ShiftRightLogical( xla::BitcastConvertType(lhs, xla::S32), sixteen), @@ -749,7 +749,7 @@ class MaxPool2DGradGradOp : public MaxPoolGradGradOp { public: explicit MaxPool2DGradGradOp(OpKernelConstruction* ctx) : MaxPoolGradGradOp(ctx, /*num_spatial_dims=*/2) { - string data_format; + std::string data_format; OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format)); OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -767,7 +767,7 @@ class MaxPool3DGradGradOp : public MaxPoolGradGradOp { public: explicit MaxPool3DGradGradOp(OpKernelConstruction* ctx) : MaxPoolGradGradOp(ctx, /*num_spatial_dims=*/3) { - string data_format; + std::string data_format; OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format)); OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); diff --git a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc index cac9f8a68f234e..961fce9caa7728 100644 --- a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc @@ -113,7 +113,7 @@ class QuantizeAndDequantizeOp : public XlaOpKernel { errors::Internal("Expected 4 inputs to QuantizeAndDequantize")); num_bits = ctx->Input(3); } else { - num_bits = xla::ConstantR0(b, num_bits_); + num_bits = xla::ConstantR0(b, num_bits_); } const xla::XlaOp zero = XlaHelpers::Zero(b, 
data_type); @@ -129,17 +129,17 @@ class QuantizeAndDequantizeOp : public XlaOpKernel { xla::XlaOp min_quantized, max_quantized; if (signed_input_) { if (narrow_range_) { - min_quantized = - -Pow(two, ConvertElementType( - num_bits - xla::ConstantR0(b, 1), xla_type)) + - one; + min_quantized = -Pow(two, ConvertElementType( + num_bits - xla::ConstantR0(b, 1), + xla_type)) + + one; } else { min_quantized = -Pow(two, ConvertElementType( - num_bits - xla::ConstantR0(b, 1), xla_type)); + num_bits - xla::ConstantR0(b, 1), xla_type)); } max_quantized = - Pow(two, ConvertElementType(num_bits - xla::ConstantR0(b, 1), + Pow(two, ConvertElementType(num_bits - xla::ConstantR0(b, 1), xla_type)) - one; } else { @@ -222,7 +222,7 @@ class QuantizeAndDequantizeV2Op : public QuantizeAndDequantizeOp { OP_REQUIRES(ctx, num_bits_ > 0 && num_bits_ < (signed_input_ ? 62 : 63), errors::InvalidArgument("num_bits is out of range: ", num_bits_, " with signed_input_ ", signed_input_)); - string round_mode_string; + std::string round_mode_string; OP_REQUIRES_OK(ctx, ctx->GetAttr("round_mode", &round_mode_string)); OP_REQUIRES( ctx, diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops_util.cc b/tensorflow/compiler/tf2xla/kernels/random_ops_util.cc index 8f2350f26861c4..dea3ecf85af7b8 100644 --- a/tensorflow/compiler/tf2xla/kernels/random_ops_util.cc +++ b/tensorflow/compiler/tf2xla/kernels/random_ops_util.cc @@ -140,7 +140,7 @@ absl::StatusOr GetAlgId(XlaOpKernelContext* ctx, int alg_input_idx) { if (alg_dtype == DT_INT32) { return alg_literal.Get({}); } else { - return alg_literal.Get({}); + return alg_literal.Get({}); } } @@ -172,7 +172,7 @@ DataType MaybeConvertBF16ToF32(DataType const& dtype) { } absl::StatusOr BuildUniformRandoms( - XlaOpKernelContext* ctx, DataType dtype, string device_type_string, + XlaOpKernelContext* ctx, DataType dtype, std::string device_type_string, TensorShape shape, std::function lo_fn, std::function hi_fn) { @@ -190,7 +190,7 @@ absl::StatusOr 
BuildUniformRandoms( absl::StatusOr BuildUniformRandoms(XlaOpKernelContext* ctx, DataType dtype, - string device_type_string, + std::string device_type_string, xla::Shape xla_shape, xla::XlaOp lo, xla::XlaOp hi) { xla::XlaOp key = ctx->Input(kRandomKeyInputIdx); diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops_util.h b/tensorflow/compiler/tf2xla/kernels/random_ops_util.h index 11ff44602f1900..5fb7aa4822834c 100644 --- a/tensorflow/compiler/tf2xla/kernels/random_ops_util.h +++ b/tensorflow/compiler/tf2xla/kernels/random_ops_util.h @@ -73,7 +73,7 @@ DataType MaybeConvertBF16ToF32(DataType const& dtype); // type, in the given low and high range, where low and high are expressed in // XLA functions. absl::StatusOr BuildUniformRandoms( - XlaOpKernelContext* ctx, DataType dtype, string device_type_string, + XlaOpKernelContext* ctx, DataType dtype, std::string device_type_string, TensorShape shape, std::function lo, std::function hi); @@ -82,7 +82,7 @@ absl::StatusOr BuildUniformRandoms( // ops. absl::StatusOr BuildUniformRandoms(XlaOpKernelContext* ctx, DataType dtype, - string device_type_string, + std::string device_type_string, xla::Shape xla_shape, xla::XlaOp lo, xla::XlaOp hi); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc index 6a8a98342c1123..3bfe9e384405b2 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc @@ -119,7 +119,7 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) { } } - string desc = ctx->op_kernel().name(); + std::string desc = ctx->op_kernel().name(); xla::XlaBuilder* const b = ctx->builder(); // Construct the builder for the reduction lambda. 
diff --git a/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc b/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc index c54c4613d29e44..a1dd0164e73fc7 100644 --- a/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc @@ -311,7 +311,7 @@ XlaOp CalculateGradData(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio, xla::Pad(grad_data, xla::Zero(ctx->builder(), warp_type), xla::MakeEdgePaddingConfig({{0, 0}, {1, 1}, {1, 1}, {0, 0}})); - auto shifting_value = xla::ConstantR1( + auto shifting_value = xla::ConstantR1( ctx->builder(), {/*batch=*/0, /*x(width)=*/1, /*y(height)=*/1}); auto shifted_gather_indices = xla::Add(gather_indices, shifting_value, {last_warp_dim}); @@ -384,7 +384,7 @@ XlaOp CalculateGradWarp(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio, xla::Pad(data, xla::Zero(ctx->builder(), data_type), xla::MakeEdgePaddingConfig({{0, 0}, {1, 1}, {1, 1}, {0, 0}})); - auto shifting_value = xla::ConstantR1( + auto shifting_value = xla::ConstantR1( ctx->builder(), {/*batch=*/0, /*x(width)=*/1, /*y(height)=*/1}); auto shifted_gather_indices = xla::Add(gather_indices, shifting_value, {last_warp_dim}); diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc index 5cecbf37706283..5c77a4dfe29934 100644 --- a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc @@ -134,8 +134,8 @@ class ReverseSequenceOp : public XlaOpKernel { } private: - int32 batch_dim_; - int32 seq_dim_; + int32_t batch_dim_; + int32_t seq_dim_; }; REGISTER_XLA_OP(Name("ReverseSequence"), ReverseSequenceOp); diff --git a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc index e1e93d614286a3..32b75c26c70212 100644 --- a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc @@ -35,7 
+35,7 @@ class SendOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override; private: - string tensor_name_; + std::string tensor_name_; SendOp(const SendOp&) = delete; void operator=(const SendOp&) = delete; @@ -60,7 +60,7 @@ class RecvOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override; private: - string tensor_name_; + std::string tensor_name_; xla::Shape shape_; RecvOp(const RecvOp&) = delete; diff --git a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc index 108bf3848aae93..d24d1688d188a6 100644 --- a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc @@ -104,7 +104,8 @@ class RangeOp : public XlaOpKernel { absl::StatusOr output; switch (type) { case DT_INT32: - output = CreateRangeTensor(start, limit, delta, ctx->builder()); + output = + CreateRangeTensor(start, limit, delta, ctx->builder()); break; case DT_INT64: output = diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc index 7e8889cb2ccee6..07bf81e9d76b58 100644 --- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc @@ -109,7 +109,7 @@ class XlaSetBoundOp : public XlaOpKernel { bound_shape.DebugString())); int64_t bound; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar("bound", &bound)); - xla::Literal bound_literal = xla::LiteralUtil::CreateR0(bound); + xla::Literal bound_literal = xla::LiteralUtil::CreateR0(bound); xla::XlaOp result = xla::CustomCall( ctx->builder(), "SetBound", {ctx->Input("input")}, ctx->InputXlaShape("input").value(), "", false, {}, &bound_literal); diff --git a/tensorflow/compiler/tf2xla/kernels/shape_util.cc b/tensorflow/compiler/tf2xla/kernels/shape_util.cc index 57825657b205ab..beb38ce9a273ea 100644 --- a/tensorflow/compiler/tf2xla/kernels/shape_util.cc +++ b/tensorflow/compiler/tf2xla/kernels/shape_util.cc @@ -33,15 +33,15 
@@ absl::Status TensorShapeToConstant(const TensorShape& input_shape, Tensor* shape_constant) { const int dims = input_shape.dims(); if (shape_constant->dtype() == DT_INT32) { - auto vec = shape_constant->vec(); + auto vec = shape_constant->vec(); for (int i = 0; i < dims; ++i) { int64_t dim_size = input_shape.dim_size(i); - if (!FastBoundsCheck(dim_size, std::numeric_limits::max())) { + if (!FastBoundsCheck(dim_size, std::numeric_limits::max())) { return errors::InvalidArgument( "Shape with out_type=int32 does not support tensors > int32max", " but dim ", i, " is ", dim_size); } - vec(i) = static_cast(dim_size); + vec(i) = static_cast(dim_size); } } else { auto vec = shape_constant->vec(); diff --git a/tensorflow/compiler/tf2xla/kernels/sharding_util_ops.cc b/tensorflow/compiler/tf2xla/kernels/sharding_util_ops.cc index 74e04e035ef3be..0ee9173cda69e3 100644 --- a/tensorflow/compiler/tf2xla/kernels/sharding_util_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/sharding_util_ops.cc @@ -101,8 +101,8 @@ absl::Status GetAndValidateAttributes(OpKernelConstruction* ctx, return absl::OkStatus(); } -std::vector GetSliceIndices(absl::Span num_partitions, - absl::Span slice_shape, +std::vector GetSliceIndices(absl::Span num_partitions, + absl::Span slice_shape, const int index) { DCHECK_EQ(num_partitions.size(), slice_shape.size()); @@ -213,7 +213,7 @@ class XlaSplitNDBaseOp : public XlaOpKernel { // Calculate paddings necessary for slice instead of padding input and // slicing subsequently to reduce temporary memory allocation. for (int dim = 0; dim < rank; ++dim) { - const int64 dim_size = input_shape.dim_size(dim); + const int64_t dim_size = input_shape.dim_size(dim); if (slice_start_indices[dim] >= dim_size) { // Complete padding. 
slice_start_indices[dim] = dim_size; @@ -387,9 +387,9 @@ class XlaConcatNDBaseOp : public XlaOpKernel { std::vector update_slice_start_indices; update_slice_start_indices.reserve(rank); - for (int64 start_index : slice_start_indices) { + for (int64_t start_index : slice_start_indices) { update_slice_start_indices.push_back( - xla::ConstantR0(ctx->builder(), start_index)); + xla::ConstantR0(ctx->builder(), start_index)); } output = xla::DynamicUpdateSlice(output, input_slice, update_slice_start_indices); diff --git a/tensorflow/compiler/tf2xla/kernels/slice_op.cc b/tensorflow/compiler/tf2xla/kernels/slice_op.cc index 844a31f97990fc..b0e337cec20c33 100644 --- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc @@ -180,8 +180,8 @@ class SliceOp : public XlaOpKernel { xla::Reshape(xla::Slice(ctx->Input(2), {i}, {i + 1}, {1}), {}); if (constant_size_is_minus_one && size[i] == -1) { // size = input_.dim_size(i) - begin[i] - dynamic_size = xla::ConstantR0(ctx->builder(), - input_shape.dim_size(i)) - + dynamic_size = xla::ConstantR0(ctx->builder(), + input_shape.dim_size(i)) - begin_indices[i]; } auto constant_size = ctx->value_inference().AnalyzeConstant( @@ -192,7 +192,7 @@ class SliceOp : public XlaOpKernel { // triggered when some dimensions's slice sizes are constant while // some are dynamic. sliced = xla::SliceInDim( - sliced, 0, constant_size->Get({}).value(), 1, i); + sliced, 0, constant_size->Get({}).value(), 1, i); } else { // We gave a generous bound (same as input) to the output, try reset // the bound if a tighter one can be found. 
diff --git a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc index ac33e0877200dc..180ba322f0fdd0 100644 --- a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc @@ -34,7 +34,7 @@ namespace { class SpaceToDepthOp : public XlaOpKernel { public: explicit SpaceToDepthOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { - string data_format_str; + std::string data_format_str; OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str)); OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_), errors::InvalidArgument("Invalid data format")); diff --git a/tensorflow/compiler/tf2xla/kernels/spmd_manual_sharding_ops.cc b/tensorflow/compiler/tf2xla/kernels/spmd_manual_sharding_ops.cc index 124e36557f1429..f6d468131ac94e 100644 --- a/tensorflow/compiler/tf2xla/kernels/spmd_manual_sharding_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/spmd_manual_sharding_ops.cc @@ -69,8 +69,8 @@ class XlaSpmdFullToShardShapeOp : public XlaOpKernel { } private: - string manual_sharding_str_; - int32 single_dim_; + std::string manual_sharding_str_; + int32_t single_dim_; std::vector unspecified_dims_; XlaSpmdFullToShardShapeOp(const XlaSpmdFullToShardShapeOp&) = delete; void operator=(const XlaSpmdFullToShardShapeOp&) = delete; @@ -120,8 +120,8 @@ class XlaSpmdShardToFullShapeOp : public XlaOpKernel { private: TensorShape full_shape_; - string manual_sharding_str_; - int32 single_dim_; + std::string manual_sharding_str_; + int32_t single_dim_; std::vector unspecified_dims_; XlaSpmdShardToFullShapeOp(const XlaSpmdShardToFullShapeOp&) = delete; void operator=(const XlaSpmdShardToFullShapeOp&) = delete; diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc index 3c99ad63565266..4672477be3534b 100644 --- a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc +++ 
b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc @@ -120,7 +120,7 @@ class StackOp : public XlaOpKernel { private: DataType dtype_; - string stack_name_; + std::string stack_name_; StackOp(const StackOp&) = delete; void operator=(const StackOp&) = delete; @@ -152,7 +152,7 @@ class StackPushOp : public XlaOpKernel { // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0]. std::vector start_indices(elem_shape.dims() + 1, - xla::ConstantR0(b, 0)); + xla::ConstantR0(b, 0)); start_indices[0] = index; TensorShape slice_shape = elem_shape; @@ -164,7 +164,7 @@ class StackPushOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, resource->SetValue(xla::Tuple( b, {xla::DynamicUpdateSlice(ta, update, start_indices), - xla::Add(index, xla::ConstantR0(b, 1))}))); + xla::Add(index, xla::ConstantR0(b, 1))}))); ctx->SetOutput(0, value); } @@ -204,12 +204,12 @@ class StackPopOp : public XlaOpKernel { xla::XlaOp ta = xla::GetTupleElement(state, 0); xla::XlaOp index = xla::GetTupleElement(state, 1); - index = Sub(index, xla::ConstantR0(b, 1)); + index = Sub(index, xla::ConstantR0(b, 1)); OP_REQUIRES_OK(ctx, resource->SetValue(xla::Tuple(b, {ta, index}))); // start_indices of the DynamicSlice are [index, 0, 0, ..., 0]. 
std::vector start_indices(stack_shape.dims(), - xla::ConstantR0(b, 0)); + xla::ConstantR0(b, 0)); start_indices[0] = index; auto slice_shape = stack_shape.dim_sizes(); diff --git a/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc index e7ff8194b96ce8..80047c5f17cc98 100644 --- a/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc @@ -511,7 +511,7 @@ class RngSkipOp : public XlaOpKernel { REGISTER_XLA_OP(Name("RngSkip").CompileTimeConstantInput("algorithm"), RngSkipOp<>); -using RngReadAndSkipOp = RngSkipOp; +using RngReadAndSkipOp = RngSkipOp; REGISTER_XLA_OP(Name("RngReadAndSkip").CompileTimeConstantInput("alg"), RngReadAndSkipOp); diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc index aa71c5c34d2e1a..246981c3465ef1 100644 --- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc @@ -76,7 +76,7 @@ xla::XlaOp MaybeConvertF32ToBF16(xla::XlaOp input, DataType dtype) { // `BitcastConvertType(ConvertElementType(u32, U16), BF16)`, to avoid the // unclear `ConvertElementType(f32, BF16)` behavior. 
xla::XlaOp output = xla::BitcastConvertType(input, xla::U32) & - xla::ConstantR0(builder, 0xFFFF0000); + xla::ConstantR0(builder, 0xFFFF0000); return xla::ConvertElementType(xla::BitcastConvertType(output, xla::F32), xla::BF16); } else { @@ -184,7 +184,7 @@ class StatelessRandomUniformOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessRandomUniformOp(const StatelessRandomUniformOp&) = delete; void operator=(const StatelessRandomUniformOp&) = delete; @@ -240,7 +240,7 @@ class StatelessRandomUniformIntOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessRandomUniformIntOp(const StatelessRandomUniformIntOp&) = delete; void operator=(const StatelessRandomUniformIntOp&) = delete; @@ -283,7 +283,7 @@ class StatelessRandomUniformFullIntOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessRandomUniformFullIntOp(const StatelessRandomUniformFullIntOp&) = delete; @@ -336,7 +336,7 @@ class StatelessRandomNormalOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessRandomNormalOp(const StatelessRandomNormalOp&) = delete; void operator=(const StatelessRandomNormalOp&) = delete; @@ -384,7 +384,7 @@ class StatelessTruncatedNormalOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessTruncatedNormalOp(const StatelessTruncatedNormalOp&) = delete; void operator=(const StatelessTruncatedNormalOp&) = delete; @@ -449,7 +449,7 @@ class StatelessParameterizedTruncatedNormalOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessParameterizedTruncatedNormalOp( const StatelessParameterizedTruncatedNormalOp&) = delete; diff --git 
a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc index ce1fee91ae6a51..689e6ca3f7bf41 100644 --- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc +++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc @@ -128,7 +128,7 @@ class StatelessRandomUniformOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessRandomUniformOp(const StatelessRandomUniformOp&) = delete; void operator=(const StatelessRandomUniformOp&) = delete; @@ -177,7 +177,7 @@ class StatelessRandomUniformIntOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessRandomUniformIntOp(const StatelessRandomUniformIntOp&) = delete; void operator=(const StatelessRandomUniformIntOp&) = delete; @@ -225,7 +225,7 @@ class StatelessRandomUniformFullIntOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessRandomUniformFullIntOp(const StatelessRandomUniformFullIntOp&) = delete; @@ -295,7 +295,7 @@ class StatelessRandomNormalOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessRandomNormalOp(const StatelessRandomNormalOp&) = delete; void operator=(const StatelessRandomNormalOp&) = delete; @@ -330,7 +330,7 @@ class StatelessTruncatedNormalOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessTruncatedNormalOp(const StatelessTruncatedNormalOp&) = delete; void operator=(const StatelessTruncatedNormalOp&) = delete; @@ -369,7 +369,7 @@ class GetKeyCounterOp : public XlaOpKernel { } private: - string device_type_string_; + std::string device_type_string_; GetKeyCounterOp(const GetKeyCounterOp&) = delete; void operator=(const GetKeyCounterOp&) = delete; @@ -392,7 +392,7 
@@ class GetAlgOp : public XlaOpKernel { } private: - string device_type_string_; + std::string device_type_string_; GetAlgOp(const GetAlgOp&) = delete; void operator=(const GetAlgOp&) = delete; @@ -430,7 +430,7 @@ class GetKeyCounterAlgOp : public XlaOpKernel { } private: - string device_type_string_; + std::string device_type_string_; GetKeyCounterAlgOp(const GetKeyCounterAlgOp&) = delete; void operator=(const GetKeyCounterAlgOp&) = delete; diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc index e15196bd756462..1b44d1e07c4bd8 100644 --- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc @@ -142,7 +142,7 @@ class StridedSliceOp : public XlaOpKernel { // Pad input to 2x to avoid OOB access. slice = xla::Pad(slice, xla::Zero(ctx->builder(), ctx->input_xla_type(0)), padding_config); - for (int64 i = 0; i < result_dims_are_dynamic.size(); ++i) { + for (int64_t i = 0; i < result_dims_are_dynamic.size(); ++i) { if (result_dims_are_dynamic[i]) { slice = xla::RemoveDynamicDimension(slice, i); } @@ -178,7 +178,7 @@ class StridedSliceOp : public XlaOpKernel { // Can't infer a lower bound. return false; } - return lower_bound->Get({}) >= 0; + return lower_bound->Get({}) >= 0; }; if (begin_mask) { begin_index = zero; @@ -220,7 +220,7 @@ class StridedSliceOp : public XlaOpKernel { // size 1 dims of a shape. slice = xla::Reshape(slice, final_shape.dim_sizes()); for (int64_t i = 0; i < final_shape.dims(); ++i) { - int64 processing_shape_dim = shape_spec.output_to_processing_mapping[i]; + int64_t processing_shape_dim = shape_spec.output_to_processing_mapping[i]; // If processing_shape_dim is -1, it means the output dimension was newly // added by new_axis_mask_, which doesn't show up in input. 
if (processing_shape_dim != -1) { @@ -341,9 +341,9 @@ class StridedSliceOp : public XlaOpKernel { int64_t sparse_index = shape_spec.output_to_sparse_mapping[i]; bool end_is_dynamic = sparse_index == -1 ? false : ends_are_dynamic[sparse_index]; - bool backward_slice = sparse_index == -1 - ? false - : end_literal.Get({sparse_index}) < 0; + bool backward_slice = + sparse_index == -1 ? false + : end_literal.Get({sparse_index}) < 0; if (input_is_dynamic || end_is_dynamic) { OP_REQUIRES( ctx, strides[input_index] == 1, @@ -363,8 +363,8 @@ class StridedSliceOp : public XlaOpKernel { "sized slice with dynamic negative index %lld. ")); operand_size = xla::Add( operand_size, - xla::ConstantR0(ctx->builder(), - end_literal.Get({sparse_index}))); + xla::ConstantR0( + ctx->builder(), end_literal.Get({sparse_index}))); } else { // The end of slice with dynamic slice size is the min of operand // shape and slice size. E.g., t[:end_size], result size is @@ -376,13 +376,13 @@ class StridedSliceOp : public XlaOpKernel { {}); } else { end_size = - xla::ConstantR0(ctx->builder(), end[input_index]); + xla::ConstantR0(ctx->builder(), end[input_index]); } operand_size = xla::Min(operand_size, end_size); } slice = xla::SetDimensionSize( slice, - xla::Sub(operand_size, xla::ConstantR0( + xla::Sub(operand_size, xla::ConstantR0( ctx->builder(), begin[input_index])), i); } @@ -397,8 +397,8 @@ class StridedSliceOp : public XlaOpKernel { } private: - int32 begin_mask_, end_mask_; - int32 ellipsis_mask_, new_axis_mask_, shrink_axis_mask_; + int32_t begin_mask_, end_mask_; + int32_t ellipsis_mask_, new_axis_mask_, shrink_axis_mask_; DataType index_type_; }; @@ -634,8 +634,8 @@ class StridedSliceGradOp : public XlaOpKernel { } private: - int32 begin_mask_, end_mask_; - int32 ellipsis_mask_, new_axis_mask_, shrink_axis_mask_; + int32_t begin_mask_, end_mask_; + int32_t ellipsis_mask_, new_axis_mask_, shrink_axis_mask_; DataType index_type_; }; @@ -751,8 +751,8 @@ class StridedSliceAssignOp : public 
XlaOpKernel { } private: - int32 begin_mask_, end_mask_; - int32 ellipsis_mask_, new_axis_mask_, shrink_axis_mask_; + int32_t begin_mask_, end_mask_; + int32_t ellipsis_mask_, new_axis_mask_, shrink_axis_mask_; DataType index_type_; DataType dtype_; }; diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc index 888908e30b2331..e89c3e3b4f837b 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc @@ -94,7 +94,7 @@ absl::Status MaybeInitializeTensorArray(xla::XlaBuilder* builder, // Checks that the TensorArray 'resource' has been initialized, and has type // 'dtype'. Sets 'shape' to the shape -absl::Status CheckTensorArrayIsInitialized(const string& op_name, +absl::Status CheckTensorArrayIsInitialized(const std::string& op_name, const XlaResource* resource, DataType dtype) { if (resource->kind() != XlaResource::kTensorArray) { @@ -184,7 +184,7 @@ class TensorArrayOp : public XlaOpKernel { private: PartialTensorShape element_shape_; DataType dtype_; - string tensor_array_name_; + std::string tensor_array_name_; TensorArrayOp(const TensorArrayOp&) = delete; void operator=(const TensorArrayOp&) = delete; @@ -218,7 +218,7 @@ class TensorArrayWriteOp : public XlaOpKernel { // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0]. std::vector start_indices(elem_shape.dims() + 1, - xla::ConstantR0(b, 0)); + xla::ConstantR0(b, 0)); start_indices[0] = index; TensorShape slice_shape = elem_shape; @@ -270,7 +270,7 @@ class TensorArrayReadOp : public XlaOpKernel { // start_indices of the DynamicSlice are [index, 0, 0, ..., 0]. std::vector start_indices(ta_shape.dims(), - xla::ConstantR0(b, 0)); + xla::ConstantR0(b, 0)); start_indices[0] = index; auto slice_shape = ta_shape.dim_sizes(); @@ -430,7 +430,7 @@ class TensorArrayScatterOp : public XlaOpKernel { // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0]. 
auto index = xla::Reshape(xla::Slice(indices, {i}, {i + 1}, {1}), {}); std::vector start_indices(elem_shape.dims() + 1, - xla::ConstantR0(b, 0)); + xla::ConstantR0(b, 0)); start_indices[0] = index; ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices, dtype_); } @@ -570,7 +570,8 @@ class TensorArraySizeOp : public XlaOpKernel { XlaResource* var; OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &var)); Tensor size_tensor(DT_INT32, {}); - size_tensor.scalar()() = static_cast(var->max_array_size()); + size_tensor.scalar()() = + static_cast(var->max_array_size()); ctx->SetConstantOutput(0, size_tensor); } @@ -609,7 +610,7 @@ class TensorArrayGradOp : public XlaOpKernel { } private: - string source_; + std::string source_; TensorArrayGradOp(const TensorArrayGradOp&) = delete; void operator=(const TensorArrayGradOp&) = delete; diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc index a1f58d5ae9b40e..f128c96c570e6c 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc @@ -70,7 +70,7 @@ absl::StatusOr>> GetTensorListDynamicDims( dynamic_dims.push_back(ctx->Input(1)); } else { dynamic_dims.push_back( - xla::ConstantR0(ctx->builder(), num_elements)); + xla::ConstantR0(ctx->builder(), num_elements)); } for (int64_t dim = 0; dim < element_shape.dimensions().size(); ++dim) { if (dims_are_dynamic[dim]) { @@ -80,7 +80,7 @@ absl::StatusOr>> GetTensorListDynamicDims( dynamic_dims.push_back(dynamic_dim_size); } else { dynamic_dims.push_back( - xla::ConstantR0(ctx->builder(), dynamic_sizes[dim])); + xla::ConstantR0(ctx->builder(), dynamic_sizes[dim])); } } list_dynamic_dims.push_back(std::move(dynamic_dims)); @@ -191,7 +191,7 @@ class TensorListReserveOp : public XlaOpKernel { OP_REQUIRES_OK( ctx, SetTensorListPushIndex( - new_list, xla::ConstantR0(ctx->builder(), num_elements), + new_list, xla::ConstantR0(ctx->builder(), num_elements), 
&result)); ctx->SetTensorListOutput(0, result); return; @@ -324,13 +324,13 @@ class TensorListElementShapeOp : public XlaOpKernel { ctx->SetOutput(0, xla::ConstantR1(b, list_shape.dimensions())); break; case DT_INT32: { - std::vector size; + std::vector size; const auto& dimensions = list_shape.dimensions(); size.reserve(dimensions.size()); for (int64_t s : dimensions) { size.push_back(s); } - ctx->SetOutput(0, xla::ConstantR1(b, size)); + ctx->SetOutput(0, xla::ConstantR1(b, size)); break; } default: diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc index 683dc4737e6dab..0a7297456fce8d 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc @@ -393,7 +393,7 @@ absl::Status ExecuteTensorListPushBack(xla::XlaOp list, xla::XlaOp element, std::vector start_indices( element_part_shape.dimensions().size() + 1, - xla::ConstantR0(b, 0)); + xla::ConstantR0(b, 0)); start_indices[0] = push_index; xla::XlaOp list_part = xla::GetTupleElement(list, i); @@ -409,7 +409,7 @@ absl::Status ExecuteTensorListPushBack(xla::XlaOp list, xla::XlaOp element, xla::XlaOp update = xla::Reshape(element, element_dims); std::vector start_indices(element_shape.dimensions().size() + 1, - xla::ConstantR0(b, 0)); + xla::ConstantR0(b, 0)); start_indices[0] = push_index; xla::XlaOp list_part = xla::GetTupleElement(list, 0); @@ -418,7 +418,7 @@ absl::Status ExecuteTensorListPushBack(xla::XlaOp list, xla::XlaOp element, result_parts.push_back(updated_list_part); } - xla::XlaOp updated_push_index = push_index + xla::ConstantR0(b, 1); + xla::XlaOp updated_push_index = push_index + xla::ConstantR0(b, 1); result_parts.push_back(updated_push_index); *result = xla::Tuple(b, result_parts); @@ -441,14 +441,14 @@ absl::Status ExecuteTensorListPopBack(xla::XlaOp list, xla::XlaOp* list_result, TF_ASSIGN_OR_RETURN(xla::Shape list_shape, b->GetShape(list)); int 
list_tuple_size = xla::ShapeUtil::TupleElementCount(list_shape); xla::XlaOp push_index = xla::GetTupleElement(list, list_tuple_size - 1); - push_index = push_index - xla::ConstantR0(b, 1); + push_index = push_index - xla::ConstantR0(b, 1); std::vector list_result_parts, element_result_parts; for (int i = 0; i < list_tuple_size - 1; i++) { const xla::Shape& list_part_shape = xla::ShapeUtil::GetTupleElementShape(list_shape, i); std::vector start_indices(list_part_shape.dimensions().size(), - xla::ConstantR0(b, 0)); + xla::ConstantR0(b, 0)); start_indices[0] = push_index; std::vector slice_shape = @@ -496,7 +496,7 @@ absl::Status ExecuteTensorListSetItem(xla::XlaOp list, xla::XlaOp index, xla::XlaOp update = xla::Reshape(element, element_dims); std::vector start_indices(element_shape.dimensions().size() + 1, - xla::ConstantR0(b, 0)); + xla::ConstantR0(b, 0)); start_indices[0] = index; xla::XlaOp list_part = xla::GetTupleElement(list, 0); @@ -550,7 +550,7 @@ absl::Status ExecuteTensorListGetItem(xla::XlaOp list, xla::XlaOp index, const xla::Shape& buffer_shape = xla::ShapeUtil::GetTupleElementShape(list_shape, 0); std::vector start_indices(buffer_shape.dimensions().size(), - xla::ConstantR0(b, 0)); + xla::ConstantR0(b, 0)); start_indices[0] = index; std::vector slice_shape = @@ -585,7 +585,7 @@ absl::Status ExecuteTensorListFromTensor(int push_index, xla::XlaOp tensor, } std::vector result_parts{tensor, - xla::ConstantR0(b, push_index)}; + xla::ConstantR0(b, push_index)}; *result = xla::Tuple(b, result_parts); return absl::OkStatus(); } diff --git a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc index 039320573f4558..9c4e0b63490205 100644 --- a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc @@ -137,7 +137,7 @@ class InvertPermutationOp : public XlaOpKernel { absl::Status status; switch (dtype) { case DT_INT32: - InvertPermutation(ctx); + 
InvertPermutation(ctx); break; case DT_INT64: InvertPermutation(ctx); diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc index 96f44d14e42ef4..90a022f5111e9a 100644 --- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc @@ -56,7 +56,7 @@ REGISTER_XLA_OP(Name("Abs"), MlirXlaOpKernel); REGISTER_XLA_OP(Name("Acos"), MlirXlaOpKernel); REGISTER_XLA_OP(Name("Acosh"), MlirXlaOpKernel); REGISTER_XLA_OP(Name("Asin"), MlirXlaOpKernel); -XLAJIT_MAKE_UNARY(Asinh, xla::Asinh(x)); +REGISTER_XLA_OP(Name("Asinh"), MlirXlaOpKernel); REGISTER_XLA_OP(Name("Atan"), MlirXlaOpKernel); REGISTER_XLA_OP(Name("Atanh"), MlirXlaOpKernel); REGISTER_XLA_OP(Name("Ceil"), MlirXlaOpKernel); diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops_composition.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops_composition.cc index dbd6cda9d950d0..1d487f70d09d21 100644 --- a/tensorflow/compiler/tf2xla/kernels/unary_ops_composition.cc +++ b/tensorflow/compiler/tf2xla/kernels/unary_ops_composition.cc @@ -36,7 +36,7 @@ namespace tensorflow { namespace { using XlaUnaryOpGenerator = std::function; -using XlaOpGeneratorMap = absl::flat_hash_map; +using XlaOpGeneratorMap = absl::flat_hash_map; void PopulateXlaOpGeneratorMap(XlaOpGeneratorMap* op_generator_map) { auto add_xla_op_generator = [&](std::string name, @@ -120,7 +120,7 @@ class UnaryOpsCompositionOp : public XlaOpKernel { } private: - std::vector op_names_; + std::vector op_names_; }; REGISTER_XLA_OP(Name("_UnaryOpsComposition"), UnaryOpsCompositionOp); diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc index a7a1a438f95b9e..c9ddab9efb6e22 100644 --- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc @@ -165,7 +165,7 @@ class ResourceGatherOp : public XlaOpKernel { } private: - int32 batch_dims_; + int32_t 
batch_dims_; }; REGISTER_XLA_OP(Name("ResourceGather"), ResourceGatherOp); diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc index 415f465f0b5088..57821f74e97024 100644 --- a/tensorflow/compiler/tf2xla/kernels/while_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc @@ -449,7 +449,8 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { // Add any TensorArray gradients touched by the body to the enclosing // graph. - for (const string& grad_source : update.tensor_array_gradients_accessed) { + for (const std::string& grad_source : + update.tensor_array_gradients_accessed) { VLOG(4) << "TensorArray " << resource->name() << " accessed gradient " << grad_source; XlaResource* gradient; @@ -553,7 +554,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { // Set token input for this "while" op. std::vector token_inputs; token_inputs.reserve(token_input_nodes_.size()); - for (const string& node_name : token_input_nodes_) { + for (const std::string& node_name : token_input_nodes_) { auto token_or = compiler->GetNodeToken(node_name); OP_REQUIRES_OK(ctx, token_or.status()); token_inputs.push_back(token_or.value()); @@ -590,7 +591,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { } else { int32_t dim_size = shape.dimensions(0); dynamic_dims.push_back( - xla::ConstantR0(ctx->builder(), dim_size)); + xla::ConstantR0(ctx->builder(), dim_size)); } // Set dynamic dimension size to 0 for element value. 
Inside the while diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.h b/tensorflow/compiler/tf2xla/kernels/while_op.h index 8e9f317ac4f3fe..b1937c14f0bebc 100644 --- a/tensorflow/compiler/tf2xla/kernels/while_op.h +++ b/tensorflow/compiler/tf2xla/kernels/while_op.h @@ -61,8 +61,8 @@ class XlaWhileOp : public XlaOpKernel { NameAttrList cond_name_attr_; NameAttrList body_name_attr_; bool has_token_input_output_; - std::vector token_input_nodes_; - string original_node_name_; + std::vector token_input_nodes_; + std::string original_node_name_; // Whether to propagate compile time consts into the loop body. // This is not supported by default now since it may cause HBM memory // overheads. diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc index faa8b30bcf9dc6..1ac01a4c172cfe 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc @@ -61,13 +61,13 @@ limitations under the License. 
#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "stablehlo/dialect/VhloOps.h" // from @stablehlo #include "stablehlo/transforms/StablehloRefineShapes.h" // from @stablehlo +#include "stablehlo/transforms/optimization/Passes.h" // from @stablehlo #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "xla/hlo/builder/xla_computation.h" #include "xla/hlo/translate/stablehlo.h" #include "xla/mlir/utils/type_util.h" -#include "xla/mlir_hlo/mhlo/transforms/passes.h" #include "xla/python/refine_polymorphic_shapes.h" #include "xla/service/hlo.pb.h" #include "xla/service/spmd/shardy/sdy_round_trip/pipelines.h" @@ -121,7 +121,7 @@ bool IsTokenType(mlir::Type type) { } absl::StatusOr> -XlaCallModuleLoader::Create(mlir::MLIRContext *context, int version, +XlaCallModuleLoader::Create(mlir::MLIRContext* context, int version, mlir::StringRef module_str, std::vector disabled_checks, std::vector platforms, @@ -165,7 +165,7 @@ absl::Status XlaCallModuleLoader::SetPlatformIndex( if (platform_index < 0) return absl::OkStatus(); VLOG(3) << "XlaCallModule setting the platform_index to " << platform_index << " for platform " << compilation_platform << "."; - mlir::Block &main_body = main_.front(); + mlir::Block& main_body = main_.front(); if (main_.getNumArguments() < 1) { return absl::InvalidArgumentError(absl::StrCat( @@ -241,19 +241,19 @@ absl::Status XlaCallModuleLoader::RefineDynamicShapes( " non-token and non-platform-index arguments. 
The input ", "shapes are (", absl::StrJoin(input_shapes, ", ", - [](std::string *out, const xla::Shape &s) { + [](std::string* out, const xla::Shape& s) { absl::StrAppend(out, s.ToString()); }), ") and the main function argument types are ", absl::StrJoin(InputTypes(), ", ", - [](std::string *out, const mlir::Type &t) { + [](std::string* out, const mlir::Type& t) { absl::StrAppend(out, mlir::debugString(t)); }), ")")); } // Derive static input types to use for main. - mlir::Block &main_body = main_.front(); + mlir::Block& main_body = main_.front(); mlir::Builder builder(module_->getContext()); std::vector static_array_input_types(nr_inputs); int next_actual_input = 0; @@ -272,7 +272,7 @@ absl::Status XlaCallModuleLoader::RefineDynamicShapes( } // Get static MLIR Type from xla Shape. - const xla::Shape &xla_shape = input_shapes[next_actual_input++]; + const xla::Shape& xla_shape = input_shapes[next_actual_input++]; std::vector xla_dimensions; if (xla_shape.IsArray()) { xla_dimensions = std::vector(xla_shape.dimensions().begin(), @@ -370,7 +370,7 @@ absl::Status XlaCallModuleLoader::RefineDynamicShapes( } absl::Status XlaCallModuleLoader::LoadModule( - mlir::MLIRContext *context, int version, mlir::StringRef module_str, + mlir::MLIRContext* context, int version, mlir::StringRef module_str, std::vector disabled_checks, std::vector platforms, int num_invocation_args, bool main_has_token_input_output, bool use_shardy_partitioner) { @@ -457,7 +457,7 @@ absl::Status XlaCallModuleLoader::LoadModule( return absl::InvalidArgumentError("Cannot find 'main' in module"); } - mlir::Block &main_body = main_.front(); + mlir::Block& main_body = main_.front(); int nr_token_arguments = llvm::count_if(InputTypes(), IsTokenType); if (version < kVersionStartSupportEffects) { @@ -489,7 +489,7 @@ absl::Status XlaCallModuleLoader::ValidateXlaCallModuleInvariants() { mlir::StatusScopedDiagnosticHandler diag_handler(module_->getContext()); bool moduleValidationFailed = false; - 
module_->walk([&](mlir::Operation *op) { + module_->walk([&](mlir::Operation* op) { // StableHLO programs created by jax2tf only contain operations // from Builtin, Func, StableHLO, Shardy dialects. if (!llvm::isagetContext()); - // TODO (b/410057228): Replace MHLO canonicalization with StableHLO. - // This code requires MHLO CaseOp canonicalization to remove unreachable - // branches, else `tf.call_tf_function` inlining can fail. mlir::PassManager pm(module_->getContext()); - pm.addPass(mlir::mhlo::createStablehloLegalizeToHloPass()); - pm.addNestedPass(mlir::createCanonicalizerPass()); - pm.addPass(mlir::mhlo::createHloLegalizeToStablehloPass()); + pm.addNestedPass( + mlir::stablehlo::createStablehloTargetIndependentOptimizationPass()); if (use_shardy_partitioner_) { // We need to export shardings because the lowering path go directly to // HLO but not the MLIR to HLO path that invokes SdyRoundTripExport. @@ -543,7 +539,7 @@ absl::Status XlaCallModuleLoader::PrepareStablehloForLowering() { if (failed(pm.run(*module_))) { return absl::InternalError( - absl::StrCat("MHLO->HLO lowering passes failed: ", + absl::StrCat("StableHLO->HLO lowering passes failed: ", diag_handler.ConsumeStatus().ToString())); } diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc index 9a2a00c58732f3..e06c0b09ba9938 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc @@ -166,13 +166,13 @@ class XlaCallModuleOp : public XlaOpKernel { explicit XlaCallModuleOp(OpKernelConstruction *ctx) : XlaOpKernel(ctx) { int version; OP_REQUIRES_OK(ctx, ctx->GetAttr("version", &version)); - string module_str; + std::string module_str; OP_REQUIRES_OK(ctx, ctx->GetAttr("module", &module_str)); std::vector expected_output_shapes; OP_REQUIRES_OK(ctx, ctx->GetAttr("Sout", &expected_output_shapes)); std::vector expected_output_dtypes; OP_REQUIRES_OK(ctx, 
ctx->GetAttr("Tout", &expected_output_dtypes)); - std::vector dim_args_spec; + std::vector dim_args_spec; OP_REQUIRES_OK(ctx, ctx->GetAttr("dim_args_spec", &dim_args_spec)); OP_REQUIRES(ctx, dim_args_spec.empty(), absl::UnimplementedError( @@ -183,9 +183,9 @@ class XlaCallModuleOp : public XlaOpKernel { "The size of Sout (", expected_output_shapes.size(), ") must match the size of Tout (", expected_output_dtypes.size(), ")"))); - std::vector disabled_checks; + std::vector disabled_checks; OP_REQUIRES_OK(ctx, ctx->GetAttr("disabled_checks", &disabled_checks)); - std::vector platforms; + std::vector platforms; OP_REQUIRES_OK(ctx, ctx->GetAttr("platforms", &platforms)); // TODO(necula): change this to OP_REQUIRES_OK when 6 months have passed // since we added the function_list and has_token_input_output @@ -222,7 +222,7 @@ class XlaCallModuleOp : public XlaOpKernel { }) << "])"; } - string compilation_device_type = ctx->device_type().type_string(); + std::string compilation_device_type = ctx->device_type().type_string(); compilation_platform_ = ""; if (compilation_device_type == DEVICE_CPU_XLA_JIT) { compilation_platform_ = "CPU"; @@ -293,7 +293,7 @@ class XlaCallModuleOp : public XlaOpKernel { xla::XlaOp token_input; if (!op_token_input_nodes_.empty()) { std::vector token_inputs; - for (const string &node_name : op_token_input_nodes_) { + for (const std::string& node_name : op_token_input_nodes_) { auto token = compiler->GetNodeToken(node_name); OP_REQUIRES_OK(ctx, token.status()); token_inputs.push_back(token.value()); diff --git a/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc index 139ac17b35c637..99a0ec6d9e38dd 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc @@ -55,8 +55,8 @@ class XlaCustomCallOp : public XlaOpKernel { } private: - string target_name_; - string backend_config_; + std::string target_name_; + 
std::string backend_config_; DataType output_type_; TensorShape output_shape_; }; diff --git a/tensorflow/compiler/tf2xla/kernels/xla_dequantize_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_dequantize_op.cc index 7b0ea597c63488..6889c093a11201 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_dequantize_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_dequantize_op.cc @@ -42,7 +42,7 @@ class XlaDequantizeOp : public XlaOpKernel { xla::QuantizedRange range(min_range_, max_range_); xla::XlaOp output = - xla::Dequantize(input, range, mode_, transpose_output_); + xla::Dequantize(input, range, mode_, transpose_output_); context->SetOutput(0, output); } @@ -50,7 +50,7 @@ class XlaDequantizeOp : public XlaOpKernel { float min_range_; float max_range_; bool transpose_output_; - string mode_; + std::string mode_; XlaDequantizeOp(const XlaDequantizeOp&) = delete; void operator=(const XlaDequantizeOp&) = delete; }; diff --git a/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc index 8236e67eeded01..f77cb46c44de8c 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc @@ -34,12 +34,12 @@ namespace { class XlaDotOp : public XlaOpKernel { public: explicit XlaDotOp(OpKernelConstruction* context) : XlaOpKernel(context) { - string dnums_attr; + std::string dnums_attr; OP_REQUIRES_OK(context, context->GetAttr("dimension_numbers", &dnums_attr)); OP_REQUIRES( context, dnums_.ParsePartialFromString(dnums_attr), errors::InvalidArgument("Error parsing convolution dimension numbers")); - string precision_config_attr; + std::string precision_config_attr; OP_REQUIRES_OK( context, context->GetAttr("precision_config", &precision_config_attr)); OP_REQUIRES( diff --git a/tensorflow/compiler/tf2xla/kernels/xla_self_adjoint_eig_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_self_adjoint_eig_op.cc index 0cfd247bdd1de6..7765de131e865c 100644 --- 
a/tensorflow/compiler/tf2xla/kernels/xla_self_adjoint_eig_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_self_adjoint_eig_op.cc @@ -41,7 +41,7 @@ class XlaSelfAdjointEigOp : public XlaOpKernel { private: bool lower_; - int32 max_iter_; + int32_t max_iter_; float epsilon_; }; diff --git a/tensorflow/compiler/tf2xla/kernels/xla_svd_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_svd_op.cc index f3bd088ced826a..6639c8003e1a15 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_svd_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_svd_op.cc @@ -37,7 +37,7 @@ class XlaSvdOp : public XlaOpKernel { explicit XlaSvdOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("max_iter", &max_iter_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("epsilon", &epsilon_)); - string precision_config_attr; + std::string precision_config_attr; OP_REQUIRES_OK(ctx, ctx->GetAttr("precision_config", &precision_config_attr)); OP_REQUIRES(ctx, @@ -57,7 +57,7 @@ class XlaSvdOp : public XlaOpKernel { } private: - int32 max_iter_; + int32_t max_iter_; float epsilon_; xla::PrecisionConfig precision_config_; }; diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc index 6a67cfa237af70..0028f8e61cbd11 100644 --- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc +++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc @@ -222,7 +222,7 @@ static absl::Status XlaDotShapeFunction(shape_inference::InferenceContext* c) { return shape_inference::UnknownShape(c); } - string dimension_numbers_string; + std::string dimension_numbers_string; TF_RETURN_IF_ERROR( c->GetAttr("dimension_numbers", &dimension_numbers_string)); @@ -1027,7 +1027,7 @@ REGISTER_OP("XlaEinsum") .Attr("equation: string") .Attr("T: {complex64, bfloat16, float}") .SetShapeFn([](shape_inference::InferenceContext* context) { - string equation; + std::string equation; TF_RETURN_IF_ERROR(context->GetAttr("equation", &equation)); // XlaEinsum supports only two-input einsum equations. 
if (!absl::StrContains(equation, ",")) { @@ -1057,9 +1057,9 @@ REGISTER_OP("XlaSpmdFullToShardShape") if (!c->RankKnown(input_handle)) { return shape_inference::UnknownShape(c); } - string sharding_attr; + std::string sharding_attr; TF_RETURN_IF_ERROR(c->GetAttr("manual_sharding", &sharding_attr)); - int32 single_dim; + int32_t single_dim; TF_RETURN_IF_ERROR(c->GetAttr("dim", &single_dim)); xla::OpSharding sharding; sharding.ParseFromString(sharding_attr); diff --git a/tensorflow/compiler/tf2xla/rearrange_function_argument.cc b/tensorflow/compiler/tf2xla/rearrange_function_argument.cc index 84ed56a468df8e..47e76f81a0328c 100644 --- a/tensorflow/compiler/tf2xla/rearrange_function_argument.cc +++ b/tensorflow/compiler/tf2xla/rearrange_function_argument.cc @@ -304,7 +304,7 @@ absl::Status MaybeRewriteWhileNode( resource_input_count, index_mapping)); // Modify cond and body functions. - for (auto const& attr_name : std::vector{"cond", "body"}) { + for (auto const& attr_name : std::vector{"cond", "body"}) { NameAttrList attr_value; TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), attr_name, &attr_value)); const FunctionBody* fbody; @@ -363,7 +363,7 @@ absl::Status MaybeRewriteWhileNode( // Save the new FunctionDef. FunctionDef new_fdef; - string new_name = + std::string new_name = fld->UniqueFunctionName(absl::StrCat(attr_value.name(), "_rearrange_")); TF_RETURN_IF_ERROR(GraphToFunctionDef(*fbody->graph, new_name, &new_fdef)); @@ -435,7 +435,7 @@ absl::Status MaybeRewriteIfNode( std::map resource_retval_to_arg, retval_index_mapping; for (auto const& attr_name : - std::vector{"then_branch", "else_branch"}) { + std::vector{"then_branch", "else_branch"}) { NameAttrList f; TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), attr_name, &f)); const FunctionBody* fbody; @@ -459,7 +459,7 @@ absl::Status MaybeRewriteIfNode( // Save the new FunctionDef. 
FunctionDef new_fdef; - string new_name = + std::string new_name = fld->UniqueFunctionName(absl::StrCat(f.name(), "_rearrange_")); TF_RETURN_IF_ERROR(GraphToFunctionDef(*fbody->graph, new_name, &new_fdef)); diff --git a/tensorflow/compiler/tf2xla/resource_operation_table_test.cc b/tensorflow/compiler/tf2xla/resource_operation_table_test.cc index 956f597301d28d..39efe2d682eb12 100644 --- a/tensorflow/compiler/tf2xla/resource_operation_table_test.cc +++ b/tensorflow/compiler/tf2xla/resource_operation_table_test.cc @@ -34,15 +34,16 @@ bool HasResourceInputOrOutput(const OpDef& op_def) { } TEST(ResourceOperationTableTest, HaveAllResourceOps) { - absl::flat_hash_map known_resource_ops; + absl::flat_hash_map known_resource_ops; for (absl::string_view known_resource_op : resource_op_table_internal::GetKnownResourceOps()) { ASSERT_TRUE( - known_resource_ops.insert({string(known_resource_op), false}).second); + known_resource_ops.insert({std::string(known_resource_op), false}) + .second); } - std::vector xla_op_names = XlaOpRegistry::GetAllRegisteredOps(); - for (const string& xla_op_name : xla_op_names) { + std::vector xla_op_names = XlaOpRegistry::GetAllRegisteredOps(); + for (const std::string& xla_op_name : xla_op_names) { const OpDef* op_def; TF_ASSERT_OK(OpRegistry::Global()->LookUpOpDef(xla_op_name, &op_def)); if (HasResourceInputOrOutput(*op_def)) { @@ -52,7 +53,7 @@ TEST(ResourceOperationTableTest, HaveAllResourceOps) { } } - std::vector unnecessary_resource_ops; + std::vector unnecessary_resource_ops; for (const auto& pair : known_resource_ops) { if (!pair.second) { unnecessary_resource_ops.push_back(pair.first); diff --git a/tensorflow/compiler/tf2xla/sharding_util.cc b/tensorflow/compiler/tf2xla/sharding_util.cc index 7e0b70e4df270a..4b285078f94d21 100644 --- a/tensorflow/compiler/tf2xla/sharding_util.cc +++ b/tensorflow/compiler/tf2xla/sharding_util.cc @@ -50,7 +50,8 @@ xla::OpMetadata CreateOpMetadata(const std::string& op_type, } void 
AssignOpMetadataToSharding(xla::OpSharding& sharding, - const string& op_type, const string& op_name) { + const std::string& op_type, + const std::string& op_name) { auto metadata = CreateOpMetadata(op_type, op_name); if (sharding.type() == xla::OpSharding::TUPLE) { for (auto& sharding_element : *sharding.mutable_tuple_shardings()) { @@ -69,7 +70,7 @@ absl::Status CoreOutOfRangeError(int core, int num_cores_per_replica) { } // namespace absl::StatusOr> ParseShardingFromDevice( - const string& device_name, int num_cores_per_replica, + const std::string& device_name, int num_cores_per_replica, std::optional explicit_sharding, std::optional metadata) { if (device_name.empty()) { @@ -102,7 +103,7 @@ absl::StatusOr> ParseShardingFromDevice( absl::StatusOr> ParseShardingFromDevice( const NodeDef& node_def, int num_cores_per_replica, bool add_metadata) { - const string& device_name = node_def.device(); + const std::string& device_name = node_def.device(); TF_ASSIGN_OR_RETURN(std::optional sharding, GetShardingFromNodeDef(node_def, add_metadata)); return ParseShardingFromDevice( @@ -114,7 +115,7 @@ absl::StatusOr> ParseShardingFromDevice( absl::StatusOr> ParseShardingFromDevice( const Node& node, int num_cores_per_replica, bool add_metadata) { - string device_name = node.assigned_device_name(); + std::string device_name = node.assigned_device_name(); if (device_name.empty()) { device_name = node.requested_device(); } @@ -152,7 +153,7 @@ absl::StatusOr> ParseShardingFromEdgeSource( } void SetShardingDeviceAssignmentFromNode(const Node& src, Node* dst) { - string device_name = src.assigned_device_name(); + std::string device_name = src.assigned_device_name(); if (device_name.empty()) { device_name = src.requested_device(); } @@ -169,7 +170,7 @@ absl::StatusOr> GetShardingFromNodeDefInternal( if (!HasNodeAttr(node_def, attribute)) { return std::optional(); } - string value; + std::string value; xla::OpSharding sharding; TF_RETURN_IF_ERROR(GetNodeAttr(node_def, attribute, 
&value)); if (tensorflow::DecodeShardingAttribute(value, sharding).failed()) { diff --git a/tensorflow/compiler/tf2xla/sharding_util.h b/tensorflow/compiler/tf2xla/sharding_util.h index e579f3ee0ff397..85259e0c729883 100644 --- a/tensorflow/compiler/tf2xla/sharding_util.h +++ b/tensorflow/compiler/tf2xla/sharding_util.h @@ -36,7 +36,7 @@ namespace tensorflow { // - a non-value if there is no assigned core or // - a sharding set as per xla::sharding_builder::AssignDevice. absl::StatusOr> ParseShardingFromDevice( - const string& device_name, int num_cores_per_replica, + const std::string& device_name, int num_cores_per_replica, std::optional explicit_sharding = std::nullopt, std::optional metadata = std::nullopt); diff --git a/tensorflow/compiler/tf2xla/sharding_util_test.cc b/tensorflow/compiler/tf2xla/sharding_util_test.cc index 585e3887fe686c..c987e8f167422f 100644 --- a/tensorflow/compiler/tf2xla/sharding_util_test.cc +++ b/tensorflow/compiler/tf2xla/sharding_util_test.cc @@ -33,7 +33,7 @@ TEST(CoreUtilTest, ParseShardingFromDevice) { Graph graph(OpRegistry::Global()); auto core_from_sharding = - [](std::optional sharding) -> int64 { + [](std::optional sharding) -> int64_t { if (sharding.has_value() && sharding.value().type() == xla::OpSharding::MAXIMAL) { return sharding.value().tile_assignment_devices(0); diff --git a/tensorflow/compiler/tf2xla/side_effect_util.cc b/tensorflow/compiler/tf2xla/side_effect_util.cc index afe82e0de40f62..e8b2a56cdf64d2 100644 --- a/tensorflow/compiler/tf2xla/side_effect_util.cc +++ b/tensorflow/compiler/tf2xla/side_effect_util.cc @@ -48,8 +48,8 @@ absl::Status SetDeviceOrdinalAttributeForNode(Node* node, int device_ordinal) { } else if (node->IsIfNode()) { AttrValue device_ordinal_value; device_ordinal_value.set_i(device_ordinal); - for (const string& attr_name : - std::vector{"then_branch", "else_branch"}) { + for (const std::string& attr_name : + std::vector{"then_branch", "else_branch"}) { NameAttrList branch_func; 
TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), attr_name, &branch_func)); (*branch_func.mutable_attr())["_device_ordinal"] = device_ordinal_value; @@ -59,7 +59,8 @@ absl::Status SetDeviceOrdinalAttributeForNode(Node* node, int device_ordinal) { } else if (node->IsWhileNode()) { AttrValue device_ordinal_value; device_ordinal_value.set_i(device_ordinal); - for (const string& attr_name : std::vector{"cond", "body"}) { + for (const std::string& attr_name : + std::vector{"cond", "body"}) { NameAttrList branch_func; TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), attr_name, &branch_func)); (*branch_func.mutable_attr())["_device_ordinal"] = device_ordinal_value; @@ -80,39 +81,40 @@ absl::Status SetDeviceOrdinalAttributeForNode(Node* node, int device_ordinal) { std::set CalculateTokenInputsForOutputToken(const Graph& g) { std::set results; Node* first_side_effecting_node_on_path = nullptr; - ReverseDFS(g, - [&](Node* n) { - std::vector token_input_nodes; - if (!GetNodeAttr(n->attrs(), kXlaTokenInputNodesAttrName, - &token_input_nodes) - .ok() || - token_input_nodes.empty()) { - return; - } - - if (first_side_effecting_node_on_path != nullptr) { - return; - } - - first_side_effecting_node_on_path = n; - string original_node_name; - TF_CHECK_OK(GetNodeAttr(n->def(), - kXlaOriginalOutsideCompilationNodeName, - &original_node_name)); - results.insert(original_node_name); - }, - [&](Node* n) { - if (first_side_effecting_node_on_path == n) { - first_side_effecting_node_on_path = nullptr; - } - }, - NodeComparatorName()); + ReverseDFS( + g, + [&](Node* n) { + std::vector token_input_nodes; + if (!GetNodeAttr(n->attrs(), kXlaTokenInputNodesAttrName, + &token_input_nodes) + .ok() || + token_input_nodes.empty()) { + return; + } + + if (first_side_effecting_node_on_path != nullptr) { + return; + } + + first_side_effecting_node_on_path = n; + std::string original_node_name; + TF_CHECK_OK(GetNodeAttr(n->def(), + kXlaOriginalOutsideCompilationNodeName, + &original_node_name)); + 
results.insert(original_node_name); + }, + [&](Node* n) { + if (first_side_effecting_node_on_path == n) { + first_side_effecting_node_on_path = nullptr; + } + }, + NodeComparatorName()); return results; } bool HasSideEffectingNodes(const Graph& g) { for (Node* n : g.nodes()) { - std::vector token_input_nodes; + std::vector token_input_nodes; if (GetNodeAttr(n->attrs(), kXlaTokenInputNodesAttrName, &token_input_nodes) .ok() && !token_input_nodes.empty()) { @@ -123,10 +125,10 @@ bool HasSideEffectingNodes(const Graph& g) { } absl::Status ParseHostComputeCoreList( - absl::Span list_from_attr, - std::map* host_compute_core) { + absl::Span list_from_attr, + std::map* host_compute_core) { for (const auto& hc_core : list_from_attr) { - std::vector parts = str_util::Split(hc_core, ":"); + std::vector parts = str_util::Split(hc_core, ":"); if (parts.size() != 2) { return errors::InvalidArgument( "Malformed host_compute_core entry ", hc_core, diff --git a/tensorflow/compiler/tf2xla/side_effect_util.h b/tensorflow/compiler/tf2xla/side_effect_util.h index 34f30eb7661bc1..9ba994a16a3c8e 100644 --- a/tensorflow/compiler/tf2xla/side_effect_util.h +++ b/tensorflow/compiler/tf2xla/side_effect_util.h @@ -61,8 +61,9 @@ bool HasSideEffectingNodes(const Graph& g); // Parse the mapping from outside_compilation_subgraph name to core number, // which is specified in an attr as a list of strings // :. -absl::Status ParseHostComputeCoreList(absl::Span list_from_attr, - std::map* host_compute_core); +absl::Status ParseHostComputeCoreList( + absl::Span list_from_attr, + std::map* host_compute_core); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/test_util.cc b/tensorflow/compiler/tf2xla/test_util.cc index 43623a8db8014f..193eb7c08bc08a 100644 --- a/tensorflow/compiler/tf2xla/test_util.cc +++ b/tensorflow/compiler/tf2xla/test_util.cc @@ -21,12 +21,12 @@ limitations under the License. 
namespace tensorflow { absl::Status InstantiateFunctionForTest( - const string& name, const FunctionLibraryDefinition& library, + const std::string& name, const FunctionLibraryDefinition& library, InstantiationResultForTest* result) { const FunctionDef* fdef = library.Find(name); TF_RET_CHECK(fdef != nullptr); - auto get_func_sig = [&library](const string& op, const OpDef** sig) { + auto get_func_sig = [&library](const std::string& op, const OpDef** sig) { return library.LookUpOpDef(op, sig); }; InstantiationResult inst; diff --git a/tensorflow/compiler/tf2xla/test_util.h b/tensorflow/compiler/tf2xla/test_util.h index 2b2eb4f582af3e..2c9cdc1c352238 100644 --- a/tensorflow/compiler/tf2xla/test_util.h +++ b/tensorflow/compiler/tf2xla/test_util.h @@ -41,7 +41,7 @@ struct InstantiationResultForTest { // Instantiates a function, producing a GraphDef to compare against the // expected graph. absl::Status InstantiateFunctionForTest( - const string& name, const FunctionLibraryDefinition& library, + const std::string& name, const FunctionLibraryDefinition& library, InstantiationResultForTest* result); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc index 504e9d0246322e..eccc2dfaf8d4a4 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc @@ -32,7 +32,8 @@ namespace tensorflow { namespace tf2xla { namespace { -void PrintSupportedOps(const string& device, const string& regen_run) { +void PrintSupportedOps(const std::string& device, + const std::string& regen_run) { XlaOpRegistry::RegisterCompilationKernels(); std::vector kdefs = @@ -46,10 +47,10 @@ void PrintSupportedOps(const string& device, const string& regen_run) { << "Operator | Type Constraint\n" << "-------- | ---------------" << std::endl; for (const KernelDef* kdef : kdefs) { - std::vector constraints; + std::vector constraints; 
constraints.reserve(kdef->constraint().size()); for (const KernelDef::AttrConstraint& constraint : kdef->constraint()) { - std::vector types; + std::vector types; const auto& allowed_values = constraint.allowed_values().list().type(); types.reserve(allowed_values.size()); for (int type : allowed_values) { @@ -70,18 +71,18 @@ void PrintSupportedOps(const string& device, const string& regen_run) { } // namespace void SupportedOpsMain(int argc, char** argv, const char* regen_run) { - std::vector device_names = XlaOpRegistry::BackendNames(); + std::vector device_names = XlaOpRegistry::BackendNames(); std::sort(device_names.begin(), device_names.end()); // Set up and parse flags. - string device; + std::string device; std::vector flag_list = { {"device", &device, "Name of the compilation device for which to print supported ops, " "one of: " + absl::StrJoin(device_names, ",")}, }; - string usage = Flags::Usage(argv[0], flag_list); + std::string usage = Flags::Usage(argv[0], flag_list); bool parsed_flags_ok = Flags::Parse(&argc, argv, flag_list); QCHECK(parsed_flags_ok) << "\n" << usage; QCHECK(XlaOpRegistry::IsBackendRegistered(device)) diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc index d61d66bfe53b72..72bd28f2b47a8c 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_test.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc @@ -118,8 +118,8 @@ TEST(ConvertGraphDefToXla, Sum) { TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation)); // Set up arguments. 
- auto x_literal = xla::LiteralUtil::CreateR0(10); - auto y_literal = xla::LiteralUtil::CreateR0(32); + auto x_literal = xla::LiteralUtil::CreateR0(10); + auto y_literal = xla::LiteralUtil::CreateR0(32); auto x_global_or = client->TransferToServer(x_literal); auto y_global_or = client->TransferToServer(y_literal); TF_EXPECT_OK(x_global_or.status()); @@ -140,23 +140,23 @@ TEST(ConvertGraphDefToXla, Sum) { ConvertGraphDefToXla(graph_def, config, client, &computation))); } -GraphDef EinsumGraph() { +GraphDef EinsumGraph(DataType dtype = DT_FLOAT) { GraphDef graph_def; NodeDef* x = graph_def.add_node(); x->set_name("x"); x->set_op("Placeholder"); - (*x->mutable_attr())["dtype"] = TypeAttrValue(DT_FLOAT); + (*x->mutable_attr())["dtype"] = TypeAttrValue(dtype); NodeDef* y = graph_def.add_node(); y->set_name("y"); y->set_op("Placeholder"); - (*y->mutable_attr())["dtype"] = TypeAttrValue(DT_FLOAT); + (*y->mutable_attr())["dtype"] = TypeAttrValue(dtype); NodeDef* einsum = graph_def.add_node(); einsum->set_name("einsum"); einsum->set_op("Einsum"); einsum->add_input("x"); einsum->add_input("y"); (*einsum->mutable_attr())["equation"] = StringAttrValue("ij,jk->ik"); - (*einsum->mutable_attr())["T"] = TypeAttrValue(DT_FLOAT); + (*einsum->mutable_attr())["T"] = TypeAttrValue(dtype); (*einsum->mutable_attr())["N"] = IntAttrValue(2); return graph_def; } @@ -233,6 +233,35 @@ TEST_F(ConvertGraphDefToXlaWithTF32Disabled, EXPECT_EQ(num_dots, 1); } +TEST_F(ConvertGraphDefToXlaWithTF32Disabled, + EinsumIsConvertedToDotWithDefaultPrecisionIfNotF32) { + GraphDef graph_def = EinsumGraph(DT_BFLOAT16); + tf2xla::Config config = EinsumConfig(); + + xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie(); + xla::XlaComputation computation; + TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation)); + + int num_dots = 0; + const xla::HloModuleProto& module_proto = computation.proto(); + for (const xla::HloComputationProto& computation_proto : + 
module_proto.computations()) { + for (const xla::HloInstructionProto& instruction_proto : + computation_proto.instructions()) { + if (instruction_proto.opcode() == "dot") { + num_dots++; + ASSERT_EQ(instruction_proto.precision_config().operand_precision_size(), + 2); + EXPECT_EQ(instruction_proto.precision_config().operand_precision(0), + xla::PrecisionConfig::DEFAULT); + EXPECT_EQ(instruction_proto.precision_config().operand_precision(1), + xla::PrecisionConfig::DEFAULT); + } + } + } + EXPECT_EQ(num_dots, 1); +} + GraphDef Conv2DGraph() { GraphDef graph_def; NodeDef* x = graph_def.add_node(); @@ -338,8 +367,8 @@ TEST(ConvertGraphDefToXla, SumWithUnusedArgument) { TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation)); // Set up arguments. - auto x_literal = xla::LiteralUtil::CreateR0(10); - auto y_literal = xla::LiteralUtil::CreateR0(32); + auto x_literal = xla::LiteralUtil::CreateR0(10); + auto y_literal = xla::LiteralUtil::CreateR0(32); auto x_global_or = client->TransferToServer(x_literal); auto y_global_or = client->TransferToServer(y_literal); auto unused_global_or = client->TransferToServer(y_literal); diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc index 9f21af2741dcde..042b572c234355 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_util.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc @@ -58,8 +58,9 @@ absl::Status ValidateTensorId(const tf2xla::TensorId& id) { return absl::OkStatus(); } -absl::Status CheckNameDuplicates(const string& kind, const string& name, - std::set* names) { +absl::Status CheckNameDuplicates(const std::string& kind, + const std::string& name, + std::set* names) { if (!name.empty()) { if (!names->insert(name).second) { return errors::InvalidArgument("duplicate ", kind, " name: ", name); @@ -68,12 +69,12 @@ absl::Status CheckNameDuplicates(const string& kind, const string& name, return absl::OkStatus(); } -absl::Status CheckFeedFetchNameConflicts(const string& 
kind, - const std::set& names) { +absl::Status CheckFeedFetchNameConflicts(const std::string& kind, + const std::set& names) { // We don't allow the feeds or fetches to contain both "foo" and "foo_data", // since that will cause a collision in codegen symbols. - for (const string& name : names) { - const string name_data(name + "_data"); + for (const std::string& name : names) { + const std::string name_data(name + "_data"); if (names.find(name_data) != names.end()) { return errors::InvalidArgument("conflicting ", kind, " name: ", name, " and ", name_data); @@ -227,7 +228,7 @@ absl::Status ReplaceRetvalInputWithArg( // the function to replace _Arg nodes in `const_input_index_to_node` with Const // inputs. absl::Status PropagateConstIntoFuncAttr( - Node* n, const string& attr_name, + Node* n, const std::string& attr_name, const absl::flat_hash_map& const_input_index_to_node, const FunctionLibraryDefinition* lookup_fld, FunctionLibraryDefinition* fld, bool passthrough_arg_to_retval = false) { @@ -255,7 +256,7 @@ absl::Status PropagateConstIntoFuncAttr( // Save rewritten function. FunctionDef replace_fdef; - string new_func_name = + std::string new_func_name = fld->UniqueFunctionName(absl::StrCat(func_attr.name(), "_const_")); const StackTracesMap* stack_traces = lookup_fld->GetStackTraces(func_attr.name()); @@ -301,7 +302,7 @@ absl::Status PropagateConstIntoIfNode( // Rewrite "then_branch" and "else_branch" function, replace usage of those // _Arg nodes with corresponding const node. 
for (const auto& attr_name : - std::vector{"then_branch", "else_branch"}) { + std::vector{"then_branch", "else_branch"}) { TF_RETURN_IF_ERROR(PropagateConstIntoFuncAttr( if_node, attr_name, const_input_index_to_node, lookup_fld, fld)); } @@ -309,13 +310,14 @@ absl::Status PropagateConstIntoIfNode( return absl::OkStatus(); } -using GraphCache = absl::flat_hash_map>; +using GraphCache = + absl::flat_hash_map>; absl::StatusOr FindOrInsert( GraphCache* cache, const NameAttrList& body_attr, const FunctionLibraryDefinition* lookup_fld, const FunctionLibraryDefinition* fallback_fld) { - const string name = body_attr.name(); + const std::string name = body_attr.name(); std::unique_ptr& value = (*cache)[name]; if (!value) { const FunctionDef* body_func = lookup_fld->Find(name); @@ -413,7 +415,7 @@ absl::Status PropagateConstIntoAndAroundWhileNode( absl::flat_hash_map const_input_index_to_mutable_node; NameAttrList body_attr; TF_RETURN_IF_ERROR(GetNodeAttr(while_node->def(), "body", &body_attr)); - const string fn_name = body_attr.name(); + const std::string fn_name = body_attr.name(); const FunctionDef* body_func = lookup_fld->Find(fn_name); if (!body_func) { return errors::Internal("Propagate: Cannot find body function ", fn_name, @@ -461,7 +463,7 @@ absl::Status PropagateConstIntoAndAroundWhileNode( // Rewrite "cond" and "body" function, replace usage of those _Arg nodes with // corresponding const node. 
- for (const auto& attr_name : std::vector{"cond", "body"}) { + for (const auto& attr_name : std::vector{"cond", "body"}) { TF_RETURN_IF_ERROR(PropagateConstIntoFuncAttr( while_node, attr_name, const_input_index_to_node, lookup_fld, fld, /*passthrough_arg_to_retval=*/attr_name == "body")); @@ -487,7 +489,7 @@ absl::StatusOr IsLoopInvariant( } absl::Status ValidateConfig(const tf2xla::Config& config) { - std::set names; + std::set names; for (const tf2xla::Feed& feed : config.feed()) { TF_RETURN_IF_ERROR(ValidateTensorId(feed.id())); TF_RETURN_IF_ERROR(TensorShape::IsValidShape(feed.shape())); @@ -508,19 +510,20 @@ absl::Status ValidateConfig(const tf2xla::Config& config) { absl::Status AddPlaceholdersForFeeds( const tf2xla::Config& config, const OpRegistryInterface* op_registry, - std::unordered_map* feed_remapping, GraphDef* graph_def) { + std::unordered_map* feed_remapping, + GraphDef* graph_def) { struct PlaceholderInfo { const tf2xla::Feed* feed = nullptr; // point to Feed in . - string placeholder_name; + std::string placeholder_name; DataType data_type = DT_INVALID; }; // Put each fed tensor into a map by name:port. A map is used for determinism // when creating placeholders (genrules want deterministic output). - std::map placeholder_info; + std::map placeholder_info; for (int i = 0; i < config.feed_size(); ++i) { const tf2xla::Feed* feed = &config.feed(i); - const string name_port = TensorIdToString(feed->id()); + const std::string name_port = TensorIdToString(feed->id()); PlaceholderInfo& info = placeholder_info[name_port]; info.feed = feed; info.placeholder_name = absl::StrCat("aot_feed_", feed->id().output_index(), @@ -529,7 +532,7 @@ absl::Status AddPlaceholdersForFeeds( } // Verify node exists and determine data type. 
- std::unordered_map name_to_node; + std::unordered_map name_to_node; for (int i = 0; i < graph_def->node_size(); ++i) { name_to_node[graph_def->node(i).name()] = &graph_def->node(i); } @@ -609,25 +612,25 @@ absl::Status PruneGraphDefInto(const tf2xla::Config& config, const GraphDef& in, out->clear_node(); // Tensors needed for feeding. - std::set> feed_tensors; + std::set> feed_tensors; for (const tf2xla::Feed& feed : config.feed()) { feed_tensors.insert( std::make_pair(feed.id().node_name(), feed.id().output_index())); } // Maps node name to reachability. - std::unordered_map> node_by_name; + std::unordered_map> node_by_name; for (const NodeDef& node : in.node()) { node_by_name[node.name()] = std::pair(false, &node); } // Traverse. - std::queue name_queue; + std::queue name_queue; for (int i = 0; i < config.fetch_size(); ++i) { name_queue.push(config.fetch(i).id().node_name()); } while (!name_queue.empty()) { - const string name = name_queue.front(); + const std::string name = name_queue.front(); name_queue.pop(); auto find_it = node_by_name.find(name); @@ -642,9 +645,9 @@ absl::Status PruneGraphDefInto(const tf2xla::Config& config, const GraphDef& in, map_entry.first = true; // Push input nodes of the currently visited node to name_queue. 
- for (const string& in_edge : map_entry.second->input()) { + for (const std::string& in_edge : map_entry.second->input()) { auto id = ParseTensorName(in_edge); - const string node_name = string(id.first); + const std::string node_name = std::string(id.first); if (feed_tensors.find(std::make_pair(node_name, id.second)) == feed_tensors.end()) { name_queue.push(node_name); @@ -668,7 +671,7 @@ absl::Status PruneGraphDefInto(const tf2xla::Config& config, const GraphDef& in, return absl::OkStatus(); } -string TensorIdToString(const tf2xla::TensorId& id) { +std::string TensorIdToString(const tf2xla::TensorId& id) { return absl::StrCat(id.node_name(), ":", id.output_index()); } @@ -682,7 +685,7 @@ absl::Status SetNodeShardingFromNeighbors(Node* n, bool out_edges) { std::optional sharding, ParseShardingFromDevice( *possible_match, - /*num_cores_per_replica=*/std::numeric_limits::max(), + /*num_cores_per_replica=*/std::numeric_limits::max(), /*add_metadata=*/false)); if (sharding && sharding->type() == xla::OpSharding::MAXIMAL) { const int core_annotation = sharding.value().tile_assignment_devices(0); @@ -709,7 +712,7 @@ void AddDtypeToKernelDefConstraint(absl::string_view name, DataType dtype, } namespace { -uint32 InitialRandomSeed() { +uint32_t InitialRandomSeed() { // Support plumbing the TF seed through to XLA is being worked on. // If a user wants deterministic behavior, their best option // is to start with a known checkpoint. This also handles issues when @@ -724,13 +727,13 @@ uint32 InitialRandomSeed() { } } // namespace -uint32 GetXLARandomSeed() { +uint32_t GetXLARandomSeed() { // We initialize counter with an odd number and increment it by two // everytime. This ensures that it will never be zero, even // after an overflow. When seeded with zero, some XLA backends // can return all zeros instead of random numbers. 
- static std::atomic counter(InitialRandomSeed()); - uint32 seed = counter.fetch_add(2); + static std::atomic counter(InitialRandomSeed()); + uint32_t seed = counter.fetch_add(2); std::srand(seed); return std::rand() | 1; } @@ -766,7 +769,7 @@ bool HasAssociatedFunction(const NodeDef& node_def, std::vector GetAssociatedFunctions( const Node& node, const FunctionLibraryDefinition* fld) { std::vector results; - const string& op = node.type_string(); + const std::string& op = node.type_string(); if (fld->Contains(op)) { // This is a function call node. AttrValueMap attrs(node.attrs().begin(), node.attrs().end()); @@ -795,7 +798,7 @@ std::vector GetAssociatedFunctions( absl::Status RewriteAssociatedFunction( Graph* graph, Node* node, FunctionLibraryDefinition* fld, const AssociatedFunctionInfo& associated_function, - const string& rewritten_function_name) { + const std::string& rewritten_function_name) { switch (associated_function.type()) { case AssociatedFunctionInfo::kFunctionCallNode: { // Change this node to call the new function. 
@@ -834,7 +837,7 @@ absl::Status RewriteAssociatedFunction( GradientDef gradient_def; gradient_def.set_function_name(func.name()); gradient_def.set_gradient_func(rewritten_function_name); - string original_grad_func = fld->FindGradient(func.name()); + std::string original_grad_func = fld->FindGradient(func.name()); if (original_grad_func.empty()) { TF_RETURN_IF_ERROR(fld->AddGradientDef(gradient_def)); } else if (original_grad_func != rewritten_function_name) { @@ -863,9 +866,9 @@ absl::Status RewriteAssociatedFunction( } absl::Status CachedFunctionHandles::GetOrInstantiate( - const string& func_name, AttrSlice attrs, + const std::string& func_name, AttrSlice attrs, FunctionLibraryRuntime::Handle* handle) { - string canonicalized_name = Canonicalize(func_name, attrs); + std::string canonicalized_name = Canonicalize(func_name, attrs); auto iter = handles_.find(canonicalized_name); if (iter != handles_.end()) { *handle = iter->second; @@ -919,8 +922,8 @@ absl::StatusOr ReplaceNode(Graph* g, Node* n, const NodeDef& node_def) { } absl::StatusOr BuildIdentityNode( - Graph* graph, const string& node_name, DataType dtype, const Node* input, - std::optional requested_device) { + Graph* graph, const std::string& node_name, DataType dtype, + const Node* input, std::optional requested_device) { // Create identity node. NodeDef ndef; ndef.set_name(node_name); @@ -975,7 +978,7 @@ absl::Status PruneUnreachableFunctionsFromGraph( g.ToGraphDef(&graph_def); FunctionLibraryDefinition reachable_functions = fld->ReachableDefinitions(graph_def); - for (const string& func_name : fld->ListFunctionNames()) { + for (const std::string& func_name : fld->ListFunctionNames()) { if (!reachable_functions.Find(func_name)) { TF_RETURN_IF_ERROR(fld->RemoveFunction(func_name)); } @@ -1106,7 +1109,7 @@ absl::Status RewriteTensorListWithConstElement(Graph* g, // Add rewritten backward While body function. 
FunctionDef new_fdef; - string new_name = fld->UniqueFunctionName( + std::string new_name = fld->UniqueFunctionName( absl::StrCat(bwd_body_attr.name(), "_tl_rewrite_")); TF_RETURN_IF_ERROR( GraphToFunctionDef(*bwd_fbody->graph, new_name, &new_fdef)); diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.h b/tensorflow/compiler/tf2xla/tf2xla_util.h index f2ce3944ac158c..4da5a474d964dc 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_util.h +++ b/tensorflow/compiler/tf2xla/tf2xla_util.h @@ -41,7 +41,8 @@ absl::Status ValidateConfig(const tf2xla::Config& config); // feeds). absl::Status AddPlaceholdersForFeeds( const tf2xla::Config& config, const OpRegistryInterface* op_registry, - std::unordered_map* feed_remapping, GraphDef* graph_def); + std::unordered_map* feed_remapping, + GraphDef* graph_def); // Returns in a copy of , pruned to only include fetches from // . @@ -49,7 +50,7 @@ absl::Status PruneGraphDefInto(const tf2xla::Config& config, const GraphDef& in, GraphDef* out); // Returns node:port for the given . -string TensorIdToString(const tf2xla::TensorId& id); +std::string TensorIdToString(const tf2xla::TensorId& id); // Updates the sharding of based on the sharding of its neighbors. // If is true, outgoing edges from are considered; else incoming @@ -61,7 +62,7 @@ void AddDtypeToKernelDefConstraint(absl::string_view name, DataType dtype, KernelDef* kdef); // Returns the next random seed to use for seeding xla rng. -uint32 GetXLARandomSeed(); +uint32_t GetXLARandomSeed(); // Indicates how a FunctionDef is associated with a graph node (e.g. the node is // a function call, or the node has function attrs). @@ -74,14 +75,14 @@ class AssociatedFunctionInfo { }; // The function is an attr of the node. 
- static AssociatedFunctionInfo FunctionAttr(const string& func_name, + static AssociatedFunctionInfo FunctionAttr(const std::string& func_name, const AttrValueMap& attrs, - const string& attr_name) { + const std::string& attr_name) { return AssociatedFunctionInfo(kFunctionAttr, func_name, attrs, attr_name); } // The node is a function call. - static AssociatedFunctionInfo FunctionCall(const string& func_name, + static AssociatedFunctionInfo FunctionCall(const std::string& func_name, const AttrValueMap& attrs) { // attr_name will not be used in this case. return AssociatedFunctionInfo(kFunctionCallNode, func_name, attrs, @@ -89,7 +90,7 @@ class AssociatedFunctionInfo { } // The node is a SymbolicGradient op. - static AssociatedFunctionInfo SymbolicGradient(const string& func_name, + static AssociatedFunctionInfo SymbolicGradient(const std::string& func_name, const AttrValueMap& attrs) { // attr_name will not be used in this case. return AssociatedFunctionInfo(kSymbolicGradient, func_name, attrs, @@ -98,15 +99,17 @@ class AssociatedFunctionInfo { AssociatedFunctionType type() const { return type_; } - const string& func_name() const { return func_name_; } + const std::string& func_name() const { return func_name_; } - const string& attr_name() const { return attr_name_; } + const std::string& attr_name() const { return attr_name_; } const AttrValueMap& attrs() const { return attrs_; } private: - AssociatedFunctionInfo(AssociatedFunctionType type, const string& func_name, - const AttrValueMap& attrs, const string& attr_name) + AssociatedFunctionInfo(AssociatedFunctionType type, + const std::string& func_name, + const AttrValueMap& attrs, + const std::string& attr_name) : type_(type), func_name_(func_name), attrs_(attrs), @@ -114,11 +117,11 @@ class AssociatedFunctionInfo { // Available for all instances. AssociatedFunctionType type_; - string func_name_; + std::string func_name_; AttrValueMap attrs_; // Only available if the function is defined in an attr. 
- string attr_name_; + std::string attr_name_; }; // Returns if the NodeDef has associated function. @@ -142,7 +145,7 @@ std::vector GetAssociatedFunctions( absl::Status RewriteAssociatedFunction( Graph* graph, Node* node, FunctionLibraryDefinition* fld, const AssociatedFunctionInfo& associated_function, - const string& rewritten_function_name); + const std::string& rewritten_function_name); // Class to act as cache for FunctionLibraryRuntime::Handle objects. class CachedFunctionHandles { @@ -152,7 +155,7 @@ class CachedFunctionHandles { // Populates `handle` for requested function and attributes. If we have // instantiated the function with the same attributes before, `handle` will be // cached handle; otherwise instantiate the function and populate `handle`. - absl::Status GetOrInstantiate(const string& func_name, AttrSlice attrs, + absl::Status GetOrInstantiate(const std::string& func_name, AttrSlice attrs, FunctionLibraryRuntime::Handle* handle); // Releases all handles in the cache. Returns first non-OK status if any; @@ -163,7 +166,7 @@ class CachedFunctionHandles { private: FunctionLibraryRuntime* flr_; - std::map handles_; + std::map handles_; CachedFunctionHandles(const CachedFunctionHandles&) = delete; void operator=(const CachedFunctionHandles&) = delete; @@ -179,9 +182,9 @@ struct OutEdgeInfo { absl::StatusOr ReplaceNode(Graph* g, Node* n, const NodeDef& node_def); // Helper function that builds an Identity node. -absl::StatusOr BuildIdentityNode(Graph* graph, const string& node_name, - DataType dtype, const Node* input, - std::optional requested_device); +absl::StatusOr BuildIdentityNode( + Graph* graph, const std::string& node_name, DataType dtype, + const Node* input, std::optional requested_device); // For "If"/"While" nodes, if some of their inputs are Const nodes, rewrite // body functions to use the Const nodes instead of original _Arg nodes. 
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc index e66a8a38813474..ef64b82f50e5be 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc @@ -157,7 +157,7 @@ TEST(ValidateConfig, ConflictingFetchName) { ExpectErrorContains(ValidateConfig(config), "conflicting fetch name"); } -static tf2xla::Config FetchesConfig(std::vector fetches) { +static tf2xla::Config FetchesConfig(std::vector fetches) { tf2xla::Config config; for (const auto& fetch_node_name : fetches) { auto* fetch = config.add_fetch(); @@ -409,7 +409,7 @@ TEST(PropagateConstIntoFunctionalNodes, CopiedConstNodeHasUniqueName) { TF_ASSERT_OK(GetNodeAttr(while_node->def(), "body", &body_fn)); const FunctionDef* rewritten_body_fn = fld.Find(body_fn.name()); ASSERT_NE(rewritten_body_fn, nullptr); - std::unordered_map nodes; + std::unordered_map nodes; for (const NodeDef& node_def : rewritten_body_fn->node_def()) { nodes[node_def.name()] = node_def; } diff --git a/tensorflow/compiler/tf2xla/type_util.cc b/tensorflow/compiler/tf2xla/type_util.cc index ec456344bcfced..007ecef7492600 100644 --- a/tensorflow/compiler/tf2xla/type_util.cc +++ b/tensorflow/compiler/tf2xla/type_util.cc @@ -87,6 +87,9 @@ absl::Status DataTypeToPrimitiveType(DataType data_type, case tensorflow::DT_FLOAT8_E5M2FNUZ: *type = xla::F8E5M2FNUZ; return absl::OkStatus(); + case tensorflow::DT_FLOAT4_E2M1FN: + *type = xla::F4E2M1FN; + return absl::OkStatus(); case tensorflow::DT_BFLOAT16: *type = xla::BF16; return absl::OkStatus(); @@ -122,6 +125,7 @@ absl::StatusOr EncodePrimitiveTypeAsDataType( {xla::F8E4M3FNUZ, DT_FLOAT8_E4M3FNUZ}, {xla::F8E4M3B11FNUZ, DT_FLOAT8_E4M3B11FNUZ}, {xla::F8E5M2FNUZ, DT_FLOAT8_E5M2FNUZ}, + {xla::F4E2M1FN, DT_FLOAT4_E2M1FN}, {xla::BF16, DT_BFLOAT16}, {xla::F16, DT_HALF}, {xla::F32, DT_FLOAT}, diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc 
b/tensorflow/compiler/tf2xla/xla_compilation_device.cc index 215decdb4d8843..add79c369b69ef 100644 --- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc +++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc @@ -39,7 +39,7 @@ class XlaCompilationAllocator : public Allocator { XlaCompilationAllocator() {} ~XlaCompilationAllocator() override {} - string Name() override { return "xla_compilation"; } + std::string Name() override { return "xla_compilation"; } void* AllocateRaw(size_t alignment, size_t num_bytes) override { // Regardless of the size requested, always allocates an XlaExpression. diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc index 4603bbf119a8bf..5ee45e499cb49e 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc @@ -25,16 +25,16 @@ limitations under the License. #include "absl/log/check.h" #include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/allocator.h" #include "xla/backends/cpu/runtime/rng_state_lib.h" -#include "xla/cpu_function_runtime.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { namespace { -int32 GetResultIndex(const int32* result_index_table, int32 num_results) { +int32_t GetResultIndex(const int32_t* result_index_table, int32_t num_results) { auto it = std::min_element(result_index_table, result_index_table + num_results); @@ -72,7 +72,7 @@ XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data, alloc_mode == AllocMode::ARGS_VARIABLES_RESULTS_PROFILES_AND_TEMPS; // Allocate arg and temp buffers. 
alloc_buffer_table_ = tensorflow::MallocContiguousBuffers( - static_data.buffer_infos_, static_data.num_buffers_, + absl::MakeConstSpan(static_data.buffer_infos_, static_data.num_buffers_), /*allocate_entry_params=*/allocate_entry_params, buffer_table_, /*annotate_initialized=*/true); // If Hlo profiling is enabled the generated code expects an appropriately @@ -150,7 +150,7 @@ int LookupNameIndex(absl::string_view name, const char** names) { } // namespace -int XlaCompiledCpuFunction::LookupArgIndex(const string& name) const { +int XlaCompiledCpuFunction::LookupArgIndex(const std::string& name) const { return LookupNameIndex(name, arg_names_); } @@ -162,7 +162,7 @@ int XlaCompiledCpuFunction::LookupVariableIndex(absl::string_view name) const { return num_args_ - num_variables_ + index; } -int XlaCompiledCpuFunction::LookupResultIndex(const string& name) const { +int XlaCompiledCpuFunction::LookupResultIndex(const std::string& name) const { return LookupNameIndex(name, result_names_); } diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h index 009650d76109bb..061982db6fd08f 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h @@ -28,9 +28,10 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/strings/string_view.h" +#include "tensorflow/compiler/tf2xla/encoded_buffer_allocation_info.h" #include "xla/backends/cpu/alignment.h" +#include "xla/backends/cpu/buffer_allocation_info.h" #include "xla/backends/cpu/runtime/rng_state_lib.h" -#include "xla/cpu_function_runtime.h" #include "xla/executable_run_options.h" #include "xla/service/custom_call_status_internal.h" #include "tensorflow/core/platform/types.h" @@ -123,19 +124,19 @@ class XlaCompiledCpuFunction { // End serialized thunk execution specific // Contains information about the buffers used by the XLA computation. 
- const xla::cpu_function_runtime::BufferInfo* buffer_infos_ = nullptr; + const xla::cpu::BufferAllocationInfo* buffer_infos_ = nullptr; int32_t num_buffers_ = 0; // Result parameter i is described by // buffer_infos[result_index_table[i]]. - const int32* result_index_table_ = nullptr; + const int32_t* result_index_table_ = nullptr; // There are num_results result parameters. int64_t num_results_ = 0; // Entry parameter i is described by // buffer_infos[arg_index_table[i]]. - const int32* arg_index_table_ = nullptr; + const int32_t* arg_index_table_ = nullptr; // There are num_args entry parameters. int64_t num_args_ = 0; @@ -209,7 +210,7 @@ class XlaCompiledCpuFunction { // TODO(fschneider): For now this always returns an empty string because there // is no support for error reporting in XLA. Remove this once all callers are // updated. - string error_msg() const { return error_msg_; } + std::string error_msg() const { return error_msg_; } void set_error_msg(absl::string_view error_msg) { error_msg_ = error_msg; } @@ -251,9 +252,7 @@ class XlaCompiledCpuFunction { // called for each positional argument, in order to set the argument buffers. // // Allocated memory must be aligned to the size specified by - // xla::cpu_function_runtime::MinAlign(). If possible, use the functions in - // tensorflow/compiler/tf2xla/cpu_function_runtime.h to ensure correct - // alignment. + // xla::cpu::MinAlign(). // // Aliasing of argument and result buffers is not allowed, and results in // undefined behavior. @@ -304,7 +303,7 @@ class XlaCompiledCpuFunction { // The index remains constant for every instance of XlaCompiledCpuFunction // generated from the same static data, and might not be cheap to determine. // Recommended usage is to capture this in a variable for re-use. - int LookupArgIndex(const string& name) const; + int LookupArgIndex(const std::string& name) const; // Returns the 0-based index for the variable with the given `name`. 
// Returns -1 if the name wasn't found, or data isn't available. @@ -320,7 +319,7 @@ class XlaCompiledCpuFunction { // The index remains constant for every instance of XlaCompiledCpuFunction // generated from the same static data, and might not be cheap to determine. // Recommended usage is to capture this in a variable for re-use. - int LookupResultIndex(const string& name) const; + int LookupResultIndex(const std::string& name) const; // Returns the name of the argument at `index`. // Returns nullptr if `HasNameIndices() == false` or `index` is out of range. @@ -362,11 +361,11 @@ class XlaCompiledCpuFunction { return temp_allocation_index_; } - const xla::cpu_function_runtime::BufferInfo* buffer_infos() const { + const xla::cpu::BufferAllocationInfo* buffer_infos() const { return buffer_infos_; } - int32 num_buffers() const { return num_buffers_; } + int32_t num_buffers() const { return num_buffers_; } void** buffer_table() const { return buffer_table_; } @@ -415,7 +414,7 @@ class XlaCompiledCpuFunction { static void set_static_data_buffer_infos( StaticData* static_data, - const xla::cpu_function_runtime::BufferInfo* buffer_infos) { + const xla::cpu::BufferAllocationInfo* buffer_infos) { static_data->buffer_infos_ = buffer_infos; } @@ -425,7 +424,7 @@ class XlaCompiledCpuFunction { } static void set_static_data_result_index_table( - StaticData* static_data, const int32* result_index_table) { + StaticData* static_data, const int32_t* result_index_table) { static_data->result_index_table_ = result_index_table; } @@ -435,7 +434,7 @@ class XlaCompiledCpuFunction { } static void set_static_data_arg_index_table(StaticData* static_data, - const int32* arg_index_table) { + const int32_t* arg_index_table) { static_data->arg_index_table_ = arg_index_table; } @@ -531,22 +530,22 @@ class XlaCompiledCpuFunction { void** const buffer_table_; // Describes the buffers used by the XLA computation. 
- const xla::cpu_function_runtime::BufferInfo* const buffer_infos_; - const int32 num_buffers_; + const xla::cpu::BufferAllocationInfo* const buffer_infos_; + const int32_t num_buffers_; // Indices of expanded result tuple. - const int32 num_results_; - const int32* const result_index_table_; + const int32_t num_results_; + const int32_t* const result_index_table_; // Argument i needs to be placed in buffer_table_[arg_index_to_temp_index_[i]] // for XLA generated code to be able to find it. - const int32* const arg_index_table_; + const int32_t* const arg_index_table_; // The number of incoming arguments. - const int32 num_args_; + const int32_t num_args_; // The number of incoming variables. - const int32 num_variables_; + const int32_t num_variables_; // Shapes of the input arguments. const ShapeInfo* const arg_shape_infos_; diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 9e761dc6003d80..5088badf28e9cb 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include #include +#include #include #include #include @@ -130,7 +131,7 @@ ComputeArgAndRetvalShardings(const Graph& graph) { [](const Node* n) -> absl::StatusOr> { TF_ASSIGN_OR_RETURN( auto sharding, - ParseShardingFromDevice(*n, std::numeric_limits::max(), + ParseShardingFromDevice(*n, std::numeric_limits::max(), /*add_metadata=*/false)); return sharding; }; @@ -173,7 +174,7 @@ absl::Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr graph, xla_context->Ref(); absl::Status status; auto step_container = std::make_unique( - step_id, [&status, device](const string& name) { + step_id, [&status, device](const std::string& name) { status = device->resource_manager()->Cleanup(name); }); TF_RETURN_IF_ERROR(step_container->Create(device->resource_manager(), @@ -484,8 +485,8 @@ absl::Status BuildComputation( } // namespace -string XlaCompiler::Argument::HumanString() const { - string common; +std::string XlaCompiler::Argument::HumanString() const { + std::string common; if (!name.empty()) { common = absl::StrCat(" name=", name); } @@ -503,7 +504,7 @@ string XlaCompiler::Argument::HumanString() const { return absl::StrCat("kind=constant-resource", common, " value=", constant_value.DebugString()); case kResource: { - string output = absl::StrCat( + std::string output = absl::StrCat( "kind=resource", common, " resource_kind=", XlaResource::KindToString(resource_kind), " initialized=", initialized, " is_fast_mem=", fast_mem); @@ -543,7 +544,7 @@ XlaCompiler::Argument::DimensionSizesAsInlinedVector() const { } } -string XlaCompiler::Argument::ShapeHumanString() const { +std::string XlaCompiler::Argument::ShapeHumanString() const { if (absl::holds_alternative(shape)) { return std::get(shape).DebugString(); } else { @@ -592,9 +593,9 @@ XlaCompiler::~XlaCompiler() = default; int64_t XlaCompiler::NextStepId() { return next_step_id_++; } -uint64 XlaCompiler::SignatureHash::operator()( - const std::pair>& signature) const { - return std::hash()(signature.first); +uint64_t 
XlaCompiler::SignatureHash::operator()( + const std::pair>& signature) const { + return std::hash()(signature.first); } static absl::Status GetFunctionBody(const NameAttrList& function, @@ -703,9 +704,9 @@ std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { flib_runtime_->GetFunctionLibraryDefinition(), &shape_info) .IgnoreError(); auto node_name_index = graph->BuildNodeNameIndex(); - std::unordered_map> shape_map; + std::unordered_map> shape_map; for (const auto& node_shape_info : shape_info) { - const string& node_name = node_shape_info.first; + const std::string& node_name = node_shape_info.first; const std::vector& output_shapes = node_shape_info.second; const auto& node_iter = node_name_index.find(node_name); if (node_iter != node_name_index.end()) { @@ -726,9 +727,9 @@ std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { flib_runtime_->GetFunctionLibraryDefinition(), &shape_info) .IgnoreError(); auto node_name_index = graph->BuildNodeNameIndex(); - std::unordered_map> shape_map; + std::unordered_map> shape_map; for (const auto& node_shape_info : shape_info) { - const string& node_name = node_shape_info.first; + const std::string& node_name = node_shape_info.first; const std::vector& output_shapes = node_shape_info.second; const auto& node_iter = node_name_index.find(node_name); if (node_iter != node_name_index.end()) { @@ -754,7 +755,7 @@ std::vector GetValidControlRets( // the map with nodes in FunctionDef control_ret_nodes and later query it // using the nodes in `graph`. The Node pointers would be different but the // Node name is expected to remain the same between the two. 
- absl::flat_hash_map control_ret_nodes_map; + absl::flat_hash_map control_ret_nodes_map; for (int i = 0; i < orig_control_ret_nodes.size(); ++i) { const Node* n = orig_control_ret_nodes[i]; control_ret_nodes_map[n->name()] = i; @@ -814,7 +815,7 @@ absl::Status XlaCompiler::CompileFunction( const NameAttrList& fn_name_attrs, absl::Span args, XlaCompiler::CompilationResult* result) { - string function_id = + std::string function_id = Canonicalize(fn_name_attrs.name(), AttrSlice(&fn_name_attrs.attr())); VLOG(1) << "XlaCompiler::CompileFunction " << function_id; @@ -1325,7 +1326,7 @@ namespace { absl::Status ValidateFunctionDef(const FunctionDef* fdef, const FunctionLibraryDefinition& flib_def) { for (const NodeDef& node : fdef->node_def()) { - const string& op = node.op(); + const std::string& op = node.op(); if (op == FunctionLibraryDefinition::kGradientOp || flib_def.Find(op)) { continue; } @@ -1340,7 +1341,8 @@ absl::Status ValidateFunctionDef(const FunctionDef* fdef, // Returned pointer points to the internal string either in node's attributes // or in its NodeDef. This pointer is valid as long as the node has not been // modified. -absl::Status GetPotentialFunctionName(const Node& node, const string** name) { +absl::Status GetPotentialFunctionName(const Node& node, + const std::string** name) { if (node.IsPartitionedCall()) { const AttrValue* attr_value; TF_RETURN_IF_ERROR( @@ -1361,7 +1363,8 @@ absl::Status GetPotentialFunctionName(const Node& node, const string** name) { // given device_type, invalid data type, missing attributes...) absl::Status ValidateGraph(const Graph* graph, const FunctionLibraryDefinition& flib_def, - const DeviceType& device_type, const string& name) { + const DeviceType& device_type, + const std::string& name) { // Make sure the XLA compilation kernels are registered. This operation is // idempotent so it is fine if someone called it already. 
XlaOpRegistry::RegisterCompilationKernels(); @@ -1398,7 +1401,7 @@ absl::Status ValidateGraph(const Graph* graph, if (node->type_string() == FunctionLibraryDefinition::kGradientOp) { continue; } - const string* function_name; + const std::string* function_name; TF_RETURN_IF_ERROR(GetPotentialFunctionName(*node, &function_name)); const FunctionDef* fdef = flib_def.Find(*function_name); absl::Status s; @@ -1455,6 +1458,36 @@ class DummyStackTrace : public AbstractStackTrace { }; namespace { +const xla::HloInstructionProto* FindInstructionById( + const xla::HloComputationProto& computation, int64_t id) { + auto iter = + absl::c_find_if(computation.instructions(), + [id](const xla::HloInstructionProto& instruction) { + return instruction.id() == id; + }); + if (iter == computation.instructions().end()) { + return nullptr; + } + return &(*iter); +} + +bool ShouldAddPrecisionToInstruction( + const xla::HloInstructionProto& instruction, + const xla::HloComputationProto& computation) { + static constexpr std::array kOpsPossiblyUsingTF32 = { + "dot", "convolution"}; + if (!absl::c_linear_search(kOpsPossiblyUsingTF32, instruction.opcode())) { + return false; + } + if (instruction.shape().element_type() == xla::F32) { + return true; + } + return absl::c_any_of(instruction.operand_ids(), [&](int64_t operand_id) { + const xla::HloInstructionProto* operand = + FindInstructionById(computation, operand_id); + return operand && operand->shape().element_type() == xla::F32; + }); +} // Add precisions configs to the HLO module to avoid TensorFloat32 computations // in XLA. @@ -1462,13 +1495,7 @@ namespace { // Some operations, such as Einsum are converted through MlirXlaOpKernel, which // doesn't set the precisions, so we set them all here. // -// TODO(tdanyluk): We may want to restrict this logic to only set the operand -// precision for F32 operands. (Historically, it was set without regard to -// operand type in other parts of TF2XLA.) 
void IncreasePrecisionsToAvoidTF32(xla::HloModuleProto& module) { - static constexpr std::array kOpsPossiblyUsingTF32 = { - "dot", "convolution"}; - xla::PrecisionConfig precision_config; precision_config.add_operand_precision(xla::PrecisionConfig::HIGHEST); precision_config.add_operand_precision(xla::PrecisionConfig::HIGHEST); @@ -1476,8 +1503,7 @@ void IncreasePrecisionsToAvoidTF32(xla::HloModuleProto& module) { for (xla::HloComputationProto& computation : *module.mutable_computations()) { for (xla::HloInstructionProto& instruction : *computation.mutable_instructions()) { - if (absl::c_find(kOpsPossiblyUsingTF32, instruction.opcode()) != - kOpsPossiblyUsingTF32.end()) { + if (ShouldAddPrecisionToInstruction(instruction, computation)) { *instruction.mutable_precision_config() = precision_config; } } @@ -1487,7 +1513,7 @@ void IncreasePrecisionsToAvoidTF32(xla::HloModuleProto& module) { } // namespace absl::Status XlaCompiler::CompileGraph( - const XlaCompiler::CompileOptions& options, string const& name, + const XlaCompiler::CompileOptions& options, const std::string& name, std::unique_ptr graph, absl::Span args, CompilationResult* result) { VLOG(1) << "Executing graph symbolically to populate XlaBuilder.: " << name; @@ -1689,7 +1715,7 @@ xla::ChannelHandle XlaCompiler::NewChannel( return new_handle; } -absl::Status XlaCompiler::GetChannelHandle(const string& key, +absl::Status XlaCompiler::GetChannelHandle(const std::string& key, xla::ChannelHandle* channel) { auto result = channels_.emplace(key, xla::ChannelHandle()); if (result.second) { @@ -1701,7 +1727,7 @@ absl::Status XlaCompiler::GetChannelHandle(const string& key, } absl::Status XlaCompiler::GetHostToDeviceChannelHandle( - const string& key, xla::ChannelHandle* channel) { + const std::string& key, xla::ChannelHandle* channel) { auto result = channels_.emplace(key, xla::ChannelHandle()); if (result.second) { result.first->second = NewChannel(xla::ChannelHandle::HOST_TO_DEVICE); @@ -1712,7 +1738,7 @@ 
absl::Status XlaCompiler::GetHostToDeviceChannelHandle( } absl::Status XlaCompiler::GetDeviceToHostChannelHandle( - const string& key, xla::ChannelHandle* channel) { + const std::string& key, xla::ChannelHandle* channel) { auto result = channels_.emplace(key, xla::ChannelHandle()); if (result.second) { result.first->second = NewChannel(xla::ChannelHandle::DEVICE_TO_HOST); @@ -1724,7 +1750,7 @@ absl::Status XlaCompiler::GetDeviceToHostChannelHandle( namespace { -void SetTransfer(const string& key, absl::Span types, +void SetTransfer(const std::string& key, absl::Span types, absl::Span shapes, tf2xla::HostTransferMetadata* transfer) { transfer->set_key(key); @@ -1739,7 +1765,7 @@ void SetTransfer(const string& key, absl::Span types, } // namespace absl::Status XlaCompiler::SetDeviceToHostMetadata( - const string& key, absl::Span types, + const std::string& key, absl::Span types, absl::Span shapes) { if (host_compute_sends_.find(key) != host_compute_sends_.end()) { tf2xla::HostTransferMetadata& existing_transfer = host_compute_sends_[key]; @@ -1759,7 +1785,7 @@ absl::Status XlaCompiler::SetDeviceToHostMetadata( } absl::Status XlaCompiler::GetDeviceToHostShapes( - const string& key, std::vector* shapes) const { + const std::string& key, std::vector* shapes) const { const auto iter = host_compute_sends_.find(key); if (iter == host_compute_sends_.end()) { return errors::InvalidArgument( @@ -1774,7 +1800,7 @@ absl::Status XlaCompiler::GetDeviceToHostShapes( } absl::Status XlaCompiler::SetHostToDeviceMetadata( - const string& key, absl::Span types, + const std::string& key, absl::Span types, absl::Span shapes) { if (host_compute_recvs_.find(key) != host_compute_recvs_.end()) { tf2xla::HostTransferMetadata& existing_transfer = host_compute_recvs_[key]; @@ -1794,7 +1820,7 @@ absl::Status XlaCompiler::SetHostToDeviceMetadata( } absl::Status XlaCompiler::GetHostComputeControlDependency( - const string& host_compute_name, xla::XlaOp* handle) { + const std::string& 
host_compute_name, xla::XlaOp* handle) { const auto iter = host_compute_control_output_.find(host_compute_name); if (iter == host_compute_control_output_.end()) { return errors::InvalidArgument( @@ -1807,7 +1833,7 @@ absl::Status XlaCompiler::GetHostComputeControlDependency( } absl::Status XlaCompiler::SetHostComputeControlDependency( - const string& host_compute_name, const xla::XlaOp handle) { + const std::string& host_compute_name, const xla::XlaOp handle) { if (host_compute_control_output_.find(host_compute_name) != host_compute_control_output_.end()) { return errors::InvalidArgument( @@ -1819,7 +1845,7 @@ absl::Status XlaCompiler::SetHostComputeControlDependency( } void XlaCompiler::PushNodeTokenMapping() { - node_token_mapping_stack_.emplace(std::map{}); + node_token_mapping_stack_.emplace(std::map{}); } absl::Status XlaCompiler::PopNodeTokenMapping() { @@ -1832,7 +1858,7 @@ absl::Status XlaCompiler::PopNodeTokenMapping() { return absl::OkStatus(); } -absl::Status XlaCompiler::SetNodeToken(const string& node_name, +absl::Status XlaCompiler::SetNodeToken(const std::string& node_name, const xla::XlaOp op) { if (node_token_mapping_stack_.empty()) { return errors::FailedPrecondition( @@ -1847,7 +1873,8 @@ absl::Status XlaCompiler::SetNodeToken(const string& node_name, return absl::OkStatus(); } -absl::StatusOr XlaCompiler::GetNodeToken(const string& node_name) { +absl::StatusOr XlaCompiler::GetNodeToken( + const std::string& node_name) { if (node_token_mapping_stack_.empty()) { return errors::FailedPrecondition( "Calling GetNodeToken() when node_token_mapping_stack_ is " diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index 2beb730eb06fa3..216125f9cb153e 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -277,7 +277,8 @@ class XlaCompiler { // Compiles a tensorflow::Graph into an xla::XlaComputation. 
// Similar to CompileFunction, but takes a Graph as input rather than a // function. - absl::Status CompileGraph(const CompileOptions& options, string const& name, + absl::Status CompileGraph(const CompileOptions& options, + const std::string& name, std::unique_ptr graph, absl::Span args, CompilationResult* result); @@ -295,31 +296,32 @@ class XlaCompiler { // Channel handles can be used to communicate between different // computations. Computations that communicate should be compiled with the // same XlaCompiler. - absl::Status GetChannelHandle(const string& key, xla::ChannelHandle* channel); + absl::Status GetChannelHandle(const std::string& key, + xla::ChannelHandle* channel); // Retrieves the host-to-device channel handle associated with `key`. // Allocates a new channel handle if none exists. - absl::Status GetHostToDeviceChannelHandle(const string& key, + absl::Status GetHostToDeviceChannelHandle(const std::string& key, xla::ChannelHandle* channel); // Retrieves the device-to-host channel handle associated with `key`. // Allocates a new channel handle if none exists. - absl::Status GetDeviceToHostChannelHandle(const string& key, + absl::Status GetDeviceToHostChannelHandle(const std::string& key, xla::ChannelHandle* channel); // Sets the shapes and types for the device to host transfer associated with // 'key'. - absl::Status SetDeviceToHostMetadata(const string& key, + absl::Status SetDeviceToHostMetadata(const std::string& key, absl::Span types, absl::Span shapes); // Gets the shapes the device to host transfer associated with 'key'. - absl::Status GetDeviceToHostShapes(const string& key, + absl::Status GetDeviceToHostShapes(const std::string& key, std::vector* shapes) const; // Sets the shapes and types for the host to device transfer associated with // 'key'. 
- absl::Status SetHostToDeviceMetadata(const string& key, + absl::Status SetHostToDeviceMetadata(const std::string& key, absl::Span types, absl::Span shapes); @@ -334,10 +336,10 @@ class XlaCompiler { // 'host_compute_name' can be any string the client wishes to use to identify // a given HostCompute Op as long as the names are unique within the // compilation. - absl::Status GetHostComputeControlDependency(const string& host_compute_name, - xla::XlaOp* handle); - absl::Status SetHostComputeControlDependency(const string& host_compute_name, - xla::XlaOp handle); + absl::Status GetHostComputeControlDependency( + const std::string& host_compute_name, xla::XlaOp* handle); + absl::Status SetHostComputeControlDependency( + const std::string& host_compute_name, xla::XlaOp handle); const Options& options() const { return options_; } xla::Client* client() const { return options_.client; } @@ -345,8 +347,8 @@ class XlaCompiler { void PushNodeTokenMapping(); absl::Status PopNodeTokenMapping(); - absl::Status SetNodeToken(const string& node_name, xla::XlaOp op); - absl::StatusOr GetNodeToken(const string& node_name); + absl::Status SetNodeToken(const std::string& node_name, xla::XlaOp op); + absl::StatusOr GetNodeToken(const std::string& node_name); // Sets the function body `fbody` to the one registered as `function`. absl::Status FindFunctionBody(const NameAttrList& function, @@ -405,20 +407,22 @@ class XlaCompiler { FunctionLibraryRuntime* flib_runtime_; // owned by pflr_. 
struct SignatureHash { - uint64 operator()( - const std::pair>& signature) const; + uint64_t operator()( + const std::pair>& signature) const; }; - std::unordered_map>, + std::unordered_map>, CompilationResult, SignatureHash> cache_; - std::unordered_map channels_; + std::unordered_map channels_; - std::unordered_map host_compute_sends_; - std::unordered_map host_compute_recvs_; + std::unordered_map + host_compute_sends_; + std::unordered_map + host_compute_recvs_; - std::unordered_map host_compute_control_output_; + std::unordered_map host_compute_control_output_; // This is used to store mapping. Side-effecting // ops call SetNodeToken() to record its token output, so later side-effecting @@ -427,7 +431,7 @@ class XlaCompiler { // It's a stack because we need a mapping like this for each level of nested // CompileGraph() call. In CompileGraph(), we will push a new mapping to the // stack, and pop the mapping before returning. - std::stack> node_token_mapping_stack_; + std::stack> node_token_mapping_stack_; XlaCompiler(const XlaCompiler&) = delete; void operator=(const XlaCompiler&) = delete; diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc index a3090e81f84a82..2c149eacda678e 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc @@ -140,7 +140,7 @@ namespace { // compiled kernels. class DummyResourceForTest : public ResourceBase { public: - string DebugString() const override { return "dummy"; } + std::string DebugString() const override { return "dummy"; } void Increment() { ++value_; } int Get() { return value_; } @@ -268,8 +268,8 @@ TEST_F(XlaCompilerTest, Simple) { std::move(graph), args, &result)); // Tests that the generated computation works. 
- xla::Literal param0_literal = xla::LiteralUtil::CreateR1({7, 42}); - xla::Literal param1_literal = xla::LiteralUtil::CreateR1({-3, 101}); + xla::Literal param0_literal = xla::LiteralUtil::CreateR1({7, 42}); + xla::Literal param1_literal = xla::LiteralUtil::CreateR1({-3, 101}); std::unique_ptr param0_data = client_->TransferToServer(param0_literal).value(); std::unique_ptr param1_data = @@ -281,7 +281,7 @@ TEST_F(XlaCompilerTest, Simple) { .value(); xla::Literal actual_literal = client_->Transfer(*actual).value(); - xla::Literal expected0 = xla::LiteralUtil::CreateR1({4, 143}); + xla::Literal expected0 = xla::LiteralUtil::CreateR1({4, 143}); xla::Literal expected_literal = xla::LiteralUtil::MakeTuple({&expected0}); EXPECT_TRUE(xla::LiteralTestUtil::Equal(expected_literal, actual_literal)); } @@ -366,8 +366,8 @@ TEST_F(XlaCompilerTest, OutOfOrderGraph) { args, &result)); // Tests that the generated computation works. - xla::Literal param0_literal = xla::LiteralUtil::CreateR1({7, 42}); - xla::Literal param1_literal = xla::LiteralUtil::CreateR1({-3, 101}); + xla::Literal param0_literal = xla::LiteralUtil::CreateR1({7, 42}); + xla::Literal param1_literal = xla::LiteralUtil::CreateR1({-3, 101}); std::unique_ptr param0_data = client_->TransferToServer(param0_literal).value(); std::unique_ptr param1_data = @@ -484,7 +484,7 @@ TEST_F(XlaCompilerTest, HonorShapeRepresentationFnForRetVal) { auto read = ops::ReadVariableOp( scope.WithControlDependencies(std::vector{write}), var, DT_INT32); - auto read_plus_one = ops::Add(scope, read, ops::Const(scope, 1)); + auto read_plus_one = ops::Add(scope, read, ops::Const(scope, 1)); auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 0); std::unique_ptr graph(new Graph(OpRegistry::Global())); TF_ASSERT_OK(scope.ToGraph(graph.get())); @@ -602,7 +602,7 @@ TEST_F(XlaCompilerTest, MixedOrderArguments) { auto read = ops::ReadVariableOp( scope.WithControlDependencies(std::vector{write}), var, DT_INT32); - auto read_plus_one = 
ops::Add(scope, read, ops::Const(scope, 1)); + auto read_plus_one = ops::Add(scope, read, ops::Const(scope, 1)); auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 0); std::unique_ptr graph(new Graph(OpRegistry::Global())); TF_ASSERT_OK(scope.ToGraph(graph.get())); @@ -680,7 +680,7 @@ TEST_F(XlaCompilerTest, ConstantOutputs) { // func(a) { b=7; c=-a; return b, c; } Scope scope = Scope::NewRootScope().ExitOnError(); auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0); - auto b = ops::Const(scope.WithOpName("B"), 7); + auto b = ops::Const(scope.WithOpName("B"), 7); auto c = ops::Neg(scope.WithOpName("C"), a); auto d = ops::_Retval(scope.WithOpName("D"), b, 0); auto e = ops::_Retval(scope.WithOpName("E"), c, 1); @@ -710,7 +710,7 @@ TEST_F(XlaCompilerTest, ConstantOutputs) { EXPECT_FALSE(result.outputs[1].is_constant); // Tests that the generated computation works. - xla::Literal param0_literal = xla::LiteralUtil::CreateR1({7, 42}); + xla::Literal param0_literal = xla::LiteralUtil::CreateR1({7, 42}); std::unique_ptr param0_data = client_->TransferToServer(param0_literal).value(); @@ -718,8 +718,8 @@ TEST_F(XlaCompilerTest, ConstantOutputs) { client_->Execute(*result.computation, {param0_data.get()}).value(); xla::Literal actual_literal = client_->Transfer(*actual).value(); - xla::Literal expected0 = xla::LiteralUtil::CreateR0(7); - xla::Literal expected1 = xla::LiteralUtil::CreateR1({-7, -42}); + xla::Literal expected0 = xla::LiteralUtil::CreateR0(7); + xla::Literal expected1 = xla::LiteralUtil::CreateR1({-7, -42}); xla::Literal expected = xla::LiteralUtil::MakeTuple({&expected0, &expected1}); EXPECT_TRUE(xla::LiteralTestUtil::Equal(expected, actual_literal)); @@ -885,7 +885,7 @@ TEST_F(XlaCompilerTest, DeterministicCompilation) { // The names of instructions were uniquified by the XlaBuilder and the // unique ids may be different, the rest of the fields should be // identical. 
- string str1, str2; + std::string str1, str2; LOG(INFO) << "instr1 = " << instr1.DebugString(); LOG(INFO) << "instr2 = " << instr2.DebugString(); instr1.AppendPartialToString(&str1); @@ -904,7 +904,7 @@ TEST_F(XlaCompilerTest, CanPassTensorArraysToAndFromComputation) { auto flow = ops::Const(scope, {}); auto grad1 = ops::TensorArrayGrad(scope, arg, flow, "grad1"); auto grad2 = ops::TensorArrayGrad(scope, arg, grad1.flow_out, "grad2"); - auto index = ops::Const(scope, 1); + auto index = ops::Const(scope, 1); auto write = ops::TensorArrayWrite(scope, grad1.grad_handle, index, index, grad2.flow_out); auto read = ops::TensorArrayRead(scope, arg, index, write.flow_out, DT_INT32); @@ -933,12 +933,12 @@ TEST_F(XlaCompilerTest, CanPassTensorArraysToAndFromComputation) { const XlaCompiler::ResourceUpdate& update = result.resource_updates[0]; EXPECT_EQ(0, update.input_index); EXPECT_EQ(DT_INT32, update.type); - EXPECT_EQ((std::set{"grad1", "grad2"}), + EXPECT_EQ((std::set{"grad1", "grad2"}), update.tensor_array_gradients_accessed); // Tests that the generated computation works. 
- xla::Literal input_base = xla::LiteralUtil::CreateR1({7, 42}); - xla::Literal input_grad2 = xla::LiteralUtil::CreateR1({-3, 101}); + xla::Literal input_base = xla::LiteralUtil::CreateR1({7, 42}); + xla::Literal input_grad2 = xla::LiteralUtil::CreateR1({-3, 101}); xla::Literal input = xla::LiteralUtil::MakeTuple({&input_base, &input_grad2}); std::unique_ptr param0_data = client_->TransferToServer(input).value(); @@ -947,10 +947,10 @@ TEST_F(XlaCompilerTest, CanPassTensorArraysToAndFromComputation) { client_->Execute(*result.computation, {param0_data.get()}).value(); xla::Literal actual_literal = client_->Transfer(*actual).value(); - xla::Literal output_read = xla::LiteralUtil::CreateR0(42); - xla::Literal output_base = xla::LiteralUtil::CreateR1({7, 42}); - xla::Literal output_grad1 = xla::LiteralUtil::CreateR1({0, 1}); - xla::Literal output_grad2 = xla::LiteralUtil::CreateR1({-3, 101}); + xla::Literal output_read = xla::LiteralUtil::CreateR0(42); + xla::Literal output_base = xla::LiteralUtil::CreateR1({7, 42}); + xla::Literal output_grad1 = xla::LiteralUtil::CreateR1({0, 1}); + xla::Literal output_grad2 = xla::LiteralUtil::CreateR1({-3, 101}); xla::Literal output_resource = xla::LiteralUtil::MakeTuple({&output_base, &output_grad1, &output_grad2}); xla::Literal expected_literal = @@ -964,7 +964,7 @@ TEST_F(XlaCompilerTest, UnwrittenTensorArrayGradientsAreNotComputationOutputs) { auto arg = ops::_Arg(scope.WithOpName("arg"), DT_RESOURCE, 0); auto flow = ops::Const(scope, {}); auto grad1 = ops::TensorArrayGrad(scope, arg, flow, "grad1"); - auto index = ops::Const(scope, 1); + auto index = ops::Const(scope, 1); auto read = ops::TensorArrayRead(scope, arg, index, grad1.flow_out, DT_INT32); auto retval = ops::_Retval(scope.WithOpName("retval"), read, 0); std::unique_ptr graph(new Graph(OpRegistry::Global())); @@ -996,7 +996,7 @@ TEST_F(XlaCompilerTest, NewTensorArrayGradientsAreComputationOutputs) { auto arg = ops::_Arg(scope.WithOpName("arg"), DT_RESOURCE, 0); auto 
flow = ops::Const(scope, {}); auto grad1 = ops::TensorArrayGrad(scope, arg, flow, "grad2"); - auto index = ops::Const(scope, 1); + auto index = ops::Const(scope, 1); auto read = ops::TensorArrayRead(scope, arg, index, grad1.flow_out, DT_INT32); auto retval = ops::_Retval(scope.WithOpName("retval"), read, 0); std::unique_ptr graph(new Graph(OpRegistry::Global())); @@ -1067,8 +1067,8 @@ TEST_F(XlaCompilerTest, FunctionCallWithConstants) { std::unique_ptr graph(new Graph(OpRegistry::Global())); Scope scope = Scope::NewRootScope().ExitOnError(); - auto value = ops::Const(scope.WithOpName("value"), 1, {}); - auto shape = ops::Const(scope.WithOpName("shape"), {5}, {1}); + auto value = ops::Const(scope.WithOpName("value"), 1, {}); + auto shape = ops::Const(scope.WithOpName("shape"), {5}, {1}); TF_EXPECT_OK(scope.graph()->AddFunctionLibrary(flib)); NodeDef def; @@ -1151,9 +1151,9 @@ TEST_F(XlaCompilerTest, SliceWithDynamicBegins) { std::unique_ptr graph(new Graph(OpRegistry::Global())); Scope scope = Scope::NewRootScope().ExitOnError(); - auto value = ops::Const(scope.WithOpName("shape"), {5}, {1}); + auto value = ops::Const(scope.WithOpName("shape"), {5}, {1}); auto begin = ops::_Arg(scope.WithOpName("arg"), DT_INT32, 0); - auto size = ops::Const(scope.WithOpName("value"), {1}, {1}); + auto size = ops::Const(scope.WithOpName("value"), {1}, {1}); TF_EXPECT_OK(scope.graph()->AddFunctionLibrary(flib)); @@ -1188,8 +1188,8 @@ TEST_F(XlaCompilerTest, SliceWithDynamicBegins) { void RunAndCheckVariablesComputation( xla::Client* client, const XlaCompiler::CompilationResult& result) { - xla::Literal param0_literal = xla::LiteralUtil::CreateR1({7, 42}); - xla::Literal param1_literal = xla::LiteralUtil::CreateR1({-3, 101}); + xla::Literal param0_literal = xla::LiteralUtil::CreateR1({7, 42}); + xla::Literal param1_literal = xla::LiteralUtil::CreateR1({-3, 101}); std::unique_ptr param0_data = client->TransferToServer(param0_literal).value(); std::unique_ptr param1_data = @@ -1201,8 
+1201,8 @@ void RunAndCheckVariablesComputation( .value(); xla::Literal actual_literal = client->Transfer(*actual).value(); - xla::Literal expected0 = xla::LiteralUtil::CreateR1({5, 144}); - xla::Literal expected1 = xla::LiteralUtil::CreateR1({4, 143}); + xla::Literal expected0 = xla::LiteralUtil::CreateR1({5, 144}); + xla::Literal expected1 = xla::LiteralUtil::CreateR1({4, 143}); xla::Literal expected_literal = xla::LiteralUtil::MakeTuple({&expected0, &expected1}); EXPECT_TRUE(xla::LiteralTestUtil::Equal(expected_literal, actual_literal)); @@ -1220,7 +1220,7 @@ TEST_F(XlaCompilerTest, Variables) { auto read = ops::ReadVariableOp( scope.WithControlDependencies(std::vector{write}), var, DT_INT32); - auto read_plus_one = ops::Add(scope, read, ops::Const(scope, 1)); + auto read_plus_one = ops::Add(scope, read, ops::Const(scope, 1)); auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 0); std::unique_ptr graph(new Graph(OpRegistry::Global())); TF_ASSERT_OK(scope.ToGraph(graph.get())); @@ -1356,7 +1356,7 @@ TEST_F(XlaCompilerTest, ReturnResourceHandleOnly) { std::move(graph), args, &result)); // Tests that the generated computation works. 
- xla::Literal param1_literal = xla::LiteralUtil::CreateR1({-3, 101}); + xla::Literal param1_literal = xla::LiteralUtil::CreateR1({-3, 101}); std::unique_ptr param1_data = client_->TransferToServer(param1_literal).value(); @@ -1379,7 +1379,7 @@ TEST_F(XlaCompilerTest, ReturnResourceHandle) { auto read = ops::ReadVariableOp( scope.WithControlDependencies(std::vector{write}), var, DT_INT32); - auto read_plus_one = ops::Add(scope, read, ops::Const(scope, 1)); + auto read_plus_one = ops::Add(scope, read, ops::Const(scope, 1)); auto r = ops::_Retval(scope.WithOpName("R"), var, 0); auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 1); @@ -1414,7 +1414,7 @@ absl::StatusOr> BuildTestGraph() { auto read = ops::ReadVariableOp( scope.WithControlDependencies(std::vector{write}), var, DT_INT32); - auto read_plus_one = ops::Add(scope, read, ops::Const(scope, 1)); + auto read_plus_one = ops::Add(scope, read, ops::Const(scope, 1)); auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 0); std::unique_ptr graph(new Graph(OpRegistry::Global())); TF_RETURN_IF_ERROR(scope.ToGraph(graph.get())); @@ -1475,9 +1475,9 @@ TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) { // Tests that the generated computation works. 
xla::Literal param0_literal = - xla::LiteralUtil::CreateR2({{4, 55}, {1, -3}}); + xla::LiteralUtil::CreateR2({{4, 55}, {1, -3}}); xla::Literal param1_literal = - xla::LiteralUtil::CreateR1({22, 11, 33, 404}); + xla::LiteralUtil::CreateR1({22, 11, 33, 404}); std::unique_ptr param0_data = client_->TransferToServer(param0_literal).value(); std::unique_ptr param1_data = @@ -1490,8 +1490,9 @@ TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) { xla::Literal actual_literal = client_->Transfer(*actual).value(); xla::Literal expected0 = - xla::LiteralUtil::CreateR2({{27, 67}, {35, 402}}); - xla::Literal expected1 = xla::LiteralUtil::CreateR1({26, 66, 34, 401}); + xla::LiteralUtil::CreateR2({{27, 67}, {35, 402}}); + xla::Literal expected1 = + xla::LiteralUtil::CreateR1({26, 66, 34, 401}); xla::Literal expected_literal = xla::LiteralUtil::MakeTuple({&expected0, &expected1}); EXPECT_TRUE(xla::LiteralTestUtil::Equal(expected_literal, actual_literal)); @@ -1547,9 +1548,9 @@ TEST_F(XlaCompilerTest, ArgRetvalShapeRepresentationFunction) { // Tests that the generated computation works. 
xla::Literal param0_literal = - xla::LiteralUtil::CreateR1({4, 55, 1, -3}); + xla::LiteralUtil::CreateR1({4, 55, 1, -3}); xla::Literal param1_literal = - xla::LiteralUtil::CreateR1({22, 11, 33, 404}); + xla::LiteralUtil::CreateR1({22, 11, 33, 404}); std::unique_ptr param0_data = client_->TransferToServer(param0_literal).value(); std::unique_ptr param1_data = @@ -1561,8 +1562,10 @@ TEST_F(XlaCompilerTest, ArgRetvalShapeRepresentationFunction) { .value(); xla::Literal actual_literal = client_->Transfer(*actual).value(); - xla::Literal expected0 = xla::LiteralUtil::CreateR1({27, 67, 35, 402}); - xla::Literal expected1 = xla::LiteralUtil::CreateR1({26, 66, 34, 401}); + xla::Literal expected0 = + xla::LiteralUtil::CreateR1({27, 67, 35, 402}); + xla::Literal expected1 = + xla::LiteralUtil::CreateR1({26, 66, 34, 401}); xla::Literal expected_literal = xla::LiteralUtil::MakeTuple({&expected0, &expected1}); EXPECT_TRUE(xla::LiteralTestUtil::Equal(expected_literal, actual_literal)); @@ -1587,8 +1590,8 @@ TEST_F(XlaCompilerTest, FunctionWithInvalidOp) { std::unique_ptr graph(new Graph(OpRegistry::Global())); Scope scope = Scope::NewRootScope().ExitOnError(); - auto value = ops::Const(scope.WithOpName("value"), 1, {}); - auto shape = ops::Const(scope.WithOpName("shape"), {5}, {1}); + auto value = ops::Const(scope.WithOpName("value"), 1, {}); + auto shape = ops::Const(scope.WithOpName("shape"), {5}, {1}); TF_ASSERT_OK(scope.graph()->AddFunctionLibrary(flib)); NodeDef def; @@ -1684,7 +1687,8 @@ TEST_F(XlaCompilerTest, TokenInputAndOutput) { side_effecting_op.set_name("DummySideEffectingOp"); side_effecting_op.set_op("DummySideEffectingOp"); AddNodeAttr(kXlaTokenInputNodesAttrName, - std::vector{kXlaTokenArgNodeName}, &side_effecting_op); + std::vector{kXlaTokenArgNodeName}, + &side_effecting_op); AddNodeAttr(kXlaOriginalOutsideCompilationNodeName, side_effecting_op.name(), &side_effecting_op); absl::Status status; @@ -1768,8 +1772,8 @@ TEST_F(XlaCompilerTest, 
OpsWithTensorListInput) { } Scope scope = Scope::NewRootScope().ExitOnError(); - auto element_shape = ops::Const(scope, {1}, {1}); - auto max_elements = ops::Const(scope, {10}, {}); + auto element_shape = ops::Const(scope, {1}, {1}); + auto max_elements = ops::Const(scope, {10}, {}); auto arg = ops::_Arg(scope.WithOpName("arg"), DT_VARIANT, 0); std::initializer_list out = {arg, arg}; auto add_n = ops::AddN(scope, out); @@ -1822,7 +1826,7 @@ TEST_F(XlaCompilerTest, WhileWithResources) { auto arg0 = ops::_Arg(scope.WithOpName("arg0"), DT_INT32, 0); auto arg1 = ops::_Arg(scope.WithOpName("arg1"), DT_RESOURCE, 1); auto arg2 = ops::_Arg(scope.WithOpName("arg2"), DT_RESOURCE, 2); - auto less = ops::Less(scope, arg0, ops::Const(scope, 10)); + auto less = ops::Less(scope, arg0, ops::Const(scope, 10)); (void)ops::_Retval(scope.WithOpName("ret"), less, 0); TF_ASSERT_OK(scope.ToGraph(graph.get())); FunctionDef fdef; @@ -1899,9 +1903,9 @@ TEST_F(XlaCompilerTest, WhileWithResources) { ASSERT_EQ(output2.input_index, 2); // Tests that the generated computation works. 
- xla::Literal literal0 = xla::LiteralUtil::CreateR0(0); - xla::Literal literal1 = xla::LiteralUtil::CreateR0(2); - xla::Literal literal2 = xla::LiteralUtil::CreateR0(1); + xla::Literal literal0 = xla::LiteralUtil::CreateR0(0); + xla::Literal literal1 = xla::LiteralUtil::CreateR0(2); + xla::Literal literal2 = xla::LiteralUtil::CreateR0(1); std::unique_ptr data0 = client_->TransferToServer(literal0).value(); std::unique_ptr data1 = @@ -1916,9 +1920,9 @@ TEST_F(XlaCompilerTest, WhileWithResources) { .value(); xla::Literal actual_literal = client_->Transfer(*actual).value(); - xla::Literal expected0 = xla::LiteralUtil::CreateR0(10); - xla::Literal expected1 = xla::LiteralUtil::CreateR0(2); - xla::Literal expected2 = xla::LiteralUtil::CreateR0(1); + xla::Literal expected0 = xla::LiteralUtil::CreateR0(10); + xla::Literal expected1 = xla::LiteralUtil::CreateR0(2); + xla::Literal expected2 = xla::LiteralUtil::CreateR0(1); xla::Literal expected_literal = xla::LiteralUtil::MakeTuple({&expected0, &expected1, &expected2}); EXPECT_TRUE(xla::LiteralTestUtil::Equal(expected_literal, actual_literal)); @@ -1978,7 +1982,7 @@ TEST_F(XlaCompilerTest, SetShardingForReturnedTuple) { TEST_F(XlaCompilerTest, AliasResourceUpdates) { Scope scope = Scope::NewRootScope().ExitOnError(); - auto a = ops::Const(scope.WithOpName("A"), {1, 2}); + auto a = ops::Const(scope.WithOpName("A"), {1, 2}); auto var = ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, 1); auto write = ops::AssignAddVariableOp(scope, var, a); auto read = ops::ReadVariableOp( @@ -2022,7 +2026,7 @@ TEST_F(XlaCompilerTest, AliasResourceUpdates) { TEST_F(XlaCompilerTest, SetDeviceToHostMetadataExactDuplicate) { XlaCompiler compiler(DefaultOptions()); - const string& key = "comm_key"; + const std::string& key = "comm_key"; std::vector types{DT_INT32}; std::vector shapes{TensorShape({2})}; @@ -2035,7 +2039,7 @@ TEST_F(XlaCompilerTest, SetDeviceToHostMetadataExactDuplicate) { TEST_F(XlaCompilerTest, 
SetDeviceToHostMetadataMismatchedDuplicate) { XlaCompiler compiler(DefaultOptions()); - const string& key = "comm_key"; + const std::string& key = "comm_key"; std::vector types{DT_INT32}; std::vector shapes{TensorShape({2})}; std::vector types2{DT_FLOAT}; @@ -2051,7 +2055,7 @@ TEST_F(XlaCompilerTest, SetDeviceToHostMetadataMismatchedDuplicate) { TEST_F(XlaCompilerTest, SetHostToDeviceMetadataExactDuplicate) { XlaCompiler compiler(DefaultOptions()); - const string& key = "comm_key"; + const std::string& key = "comm_key"; std::vector types{DT_INT32}; std::vector shapes{TensorShape({2})}; @@ -2064,7 +2068,7 @@ TEST_F(XlaCompilerTest, SetHostToDeviceMetadataExactDuplicate) { TEST_F(XlaCompilerTest, SetHostToDeviceMetadataMismatchedDuplicate) { XlaCompiler compiler(DefaultOptions()); - const string& key = "comm_key"; + const std::string& key = "comm_key"; std::vector types{DT_INT32}; std::vector shapes{TensorShape({2})}; std::vector types2{DT_FLOAT}; diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc index 92ddf0125aded1..fad607b1ae1333 100644 --- a/tensorflow/compiler/tf2xla/xla_context.cc +++ b/tensorflow/compiler/tf2xla/xla_context.cc @@ -67,7 +67,7 @@ XlaContext::XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder, } } -string XlaContext::DebugString() const { return "XLA JIT context"; } +std::string XlaContext::DebugString() const { return "XLA JIT context"; } void XlaContext::SetRetval(int index, const XlaExpression& expression) { const int64_t retvals_size = retvals_.size(); @@ -84,7 +84,7 @@ XlaResource* XlaContext::AddResource(std::unique_ptr resource) { const xla::XlaComputation* XlaContext::GetOrCreateMax(const DataType type) { return LookupOrCreate(type, &max_func_, [type] { - const string type_string = DataTypeString(type); + const std::string type_string = DataTypeString(type); VLOG(1) << "Building Max() for " << type_string; xla::XlaBuilder b("max<" + type_string + ">"); xla::PrimitiveType xla_type; 
@@ -100,7 +100,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateMax(const DataType type) { const xla::XlaComputation* XlaContext::GetOrCreateMin(const DataType type) { return LookupOrCreate(type, &min_func_, [type] { - const string type_string = DataTypeString(type); + const std::string type_string = DataTypeString(type); VLOG(1) << "Building Min() for " << type_string; xla::XlaBuilder b("min<" + type_string + ">"); xla::PrimitiveType xla_type; @@ -116,7 +116,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateMin(const DataType type) { const xla::XlaComputation* XlaContext::GetOrCreateAdd(const DataType type) { return LookupOrCreate(type, &add_func_, [type] { - const string type_string = DataTypeString(type); + const std::string type_string = DataTypeString(type); VLOG(1) << "Building Add() for " << type_string; xla::XlaBuilder b("add<" + type_string + ">"); xla::PrimitiveType xla_type; @@ -133,7 +133,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateAdd(const DataType type) { const xla::XlaComputation* XlaContext::GetOrCreateLogAddExp( const DataType type) { return LookupOrCreate(type, &log_add_exp_func_, [type] { - const string type_string = DataTypeString(type); + const std::string type_string = DataTypeString(type); VLOG(1) << "Building LogAddExp() for " << type_string; xla::XlaBuilder b("log_add_exp<" + type_string + ">"); xla::PrimitiveType xla_type; @@ -154,7 +154,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateLogAddExp( const xla::XlaComputation* XlaContext::GetOrCreateMul(const DataType type) { return LookupOrCreate(type, &mul_func_, [type] { - const string type_string = DataTypeString(type); + const std::string type_string = DataTypeString(type); VLOG(1) << "Building Mul() for " << type_string; xla::XlaBuilder b("mul<" + type_string + ">"); xla::PrimitiveType xla_type; diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h index 9184fb4300633c..1d72f0c756f364 100644 --- 
a/tensorflow/compiler/tf2xla/xla_context.h +++ b/tensorflow/compiler/tf2xla/xla_context.h @@ -50,7 +50,7 @@ class XlaContext : public ResourceBase { const Graph* graph); // Virtual method defined by ResourceBase. - string DebugString() const override; + std::string DebugString() const override; XlaCompiler* compiler() const { return compiler_; } diff --git a/tensorflow/compiler/tf2xla/xla_expression.cc b/tensorflow/compiler/tf2xla/xla_expression.cc index 61bd10e413ccf3..e867dd14209ab8 100644 --- a/tensorflow/compiler/tf2xla/xla_expression.cc +++ b/tensorflow/compiler/tf2xla/xla_expression.cc @@ -73,7 +73,7 @@ XlaExpression XlaExpression::Resource(XlaResource* resource) { return e; } -string XlaExpression::HumanString() const { +std::string XlaExpression::HumanString() const { switch (kind_) { case Kind::kInvalid: return "invalid"; diff --git a/tensorflow/compiler/tf2xla/xla_expression.h b/tensorflow/compiler/tf2xla/xla_expression.h index d410b79a3da137..ed0041fc9942a0 100644 --- a/tensorflow/compiler/tf2xla/xla_expression.h +++ b/tensorflow/compiler/tf2xla/xla_expression.h @@ -115,7 +115,7 @@ class XlaExpression { XlaResource* resource() const { return resource_; } // Returns a human-readable summary of the expression. - string HumanString() const; + std::string HumanString() const; // Returns the value of a kValue or kXlaOp as an xla::XlaOp. Returns // an erroneous XlaOp if the expression is not a constant or an expression. 
diff --git a/tensorflow/compiler/tf2xla/xla_expression_test.cc b/tensorflow/compiler/tf2xla/xla_expression_test.cc index 7a0cc34de9af2e..797002476aeb1c 100644 --- a/tensorflow/compiler/tf2xla/xla_expression_test.cc +++ b/tensorflow/compiler/tf2xla/xla_expression_test.cc @@ -38,14 +38,15 @@ class XlaExpressionTest : public ::testing::Test { void SetUp() override { client_ = xla::ClientLibrary::LocalClientOrDie(); builder_ = std::make_unique("acomputation"); - constant_ = test::AsScalar(42); - op_ = xla::ConstantR0(builder_.get(), 7); + constant_ = test::AsScalar(42); + op_ = xla::ConstantR0(builder_.get(), 7); non_constant_op_ = xla::Parameter( builder_.get(), 0, xla::ShapeUtil::MakeShape(xla::F32, {}), "x"); resource_ = std::make_unique( - XlaResource::kVariable, /*arg_num=*/0, /*name=*/string("avariable"), - DT_INT32, TensorShape({17, 3}), op_, /*tensor_array_size=*/-1, - /*tensor_array_gradients=*/std::set(), + XlaResource::kVariable, /*arg_num=*/0, + /*name=*/std::string("avariable"), DT_INT32, TensorShape({17, 3}), op_, + /*tensor_array_size=*/-1, + /*tensor_array_gradients=*/std::set(), /*tensor_array_multiple_writes_aggregate=*/false); } @@ -87,8 +88,8 @@ TEST_F(XlaExpressionTest, AsXlaOp) { builder_->BuildConstantSubGraph(const_as_op)); TF_ASSERT_OK_AND_ASSIGN(xla::Literal value, client_->ComputeConstant(computation)); - EXPECT_TRUE(xla::LiteralTestUtil::Equal(xla::LiteralUtil::CreateR0(42), - value)); + EXPECT_TRUE(xla::LiteralTestUtil::Equal( + xla::LiteralUtil::CreateR0(42), value)); } TEST_F(XlaExpressionTest, GetShape) { @@ -120,7 +121,7 @@ TEST_F(XlaExpressionTest, ResolveConstant) { std::optional op_constant, XlaExpression::XlaOp(op_, DT_INT32).ResolveConstant(client_)); ASSERT_TRUE(op_constant.has_value()); - test::ExpectTensorEqual(test::AsScalar(7), *op_constant); + test::ExpectTensorEqual(test::AsScalar(7), *op_constant); TF_ASSERT_OK_AND_ASSIGN(std::optional op_nonconstant, XlaExpression::XlaOp(non_constant_op_, DT_FLOAT) @@ -131,7 +132,7 @@ 
TEST_F(XlaExpressionTest, ResolveConstant) { std::optional constant_constant, XlaExpression::Constant(constant_).ResolveConstant(client_)); ASSERT_TRUE(constant_constant.has_value()); - test::ExpectTensorEqual(constant_, *constant_constant); + test::ExpectTensorEqual(constant_, *constant_constant); } TEST_F(XlaExpressionTest, ResolveConstantOnResource) { diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc index eb91ed5c3f78d6..45814517342abc 100644 --- a/tensorflow/compiler/tf2xla/xla_helpers.cc +++ b/tensorflow/compiler/tf2xla/xla_helpers.cc @@ -233,7 +233,7 @@ absl::Status ResolveDeviceAssignment( // For GPU collectives, `xla_global_id`s are arbitrary integers, and XLA // requires a mapping from local device IDs to global device IDs. const DeviceMgr* device_mgr = ctx->function_library()->device_mgr(); - std::map global_device_ids; + absl::btree_map global_device_ids; for (int device_idx = 0; device_idx < params->group.group_size; device_idx++) { @@ -246,8 +246,8 @@ absl::Status ResolveDeviceAssignment( // This is a local device, so include it in the mapping. const DeviceBase::AcceleratorDeviceInfo* accelerator_device_info = resolved_device->tensorflow_accelerator_device_info(); - global_device_ids[accelerator_device_info->stream->parent() - ->device_ordinal()] = + global_device_ids[xla::LocalDeviceId( + accelerator_device_info->stream->parent()->device_ordinal())] = device_attributes.xla_global_id(); } } diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h index 38f01c83db8251..0b3425e5b8524a 100644 --- a/tensorflow/compiler/tf2xla/xla_helpers.h +++ b/tensorflow/compiler/tf2xla/xla_helpers.h @@ -136,7 +136,7 @@ struct XlaResourceUpdate { bool modified; // If the resource is a TensorArray, the set of gradients read or written. 
- std::set tensor_array_gradients_accessed; + std::set tensor_array_gradients_accessed; }; struct XlaCompilationResult { diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc index ad571976cbfcf5..b374e8c8e81dd6 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc @@ -26,13 +26,13 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/tf2xla.h" #include "tensorflow/compiler/tf2xla/tf2xla.pb.h" #include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h" +#include "xla/backends/cpu/buffer_allocation_info.h" +#include "xla/backends/cpu/buffer_allocation_info_util.h" #include "xla/backends/cpu/codegen/compiled_function_library.h" #include "xla/client/client_library.h" #include "xla/client/executable_build_options.h" #include "xla/client/local_client.h" -#include "xla/cpu_function_runtime.h" #include "xla/hlo/builder/xla_computation.h" -#include "xla/service/cpu/buffer_info_util.h" #include "xla/service/cpu/cpu_aot_compilation_result.h" #include "xla/service/cpu/cpu_executable.h" #include "xla/service/platform_util.h" @@ -62,10 +62,10 @@ absl::StatusOr ComputeResultIndex( // Returns the number of results. int CountResults( - absl::Span buffer_infos) { + absl::Span buffer_infos) { int num_results = 0; for (const auto& info : buffer_infos) { - if (info.is_result_parameter()) { + if (info.is_result()) { ++num_results; } } @@ -76,12 +76,12 @@ int CountResults( // tf2xla::{Feed,Fetch,Variable}. We hold the actual strings in nonempty_names, // and hold arrays of pointers in name_ptrs, terminated by a nullptr entry. template -void CollectNames(const T& entries, std::vector* nonempty_names, +void CollectNames(const T& entries, std::vector* nonempty_names, std::vector* name_ptrs) { // First collect `nonempty_names`, to ensure the underlying strings won't // change out from under us. 
for (const auto& entry : entries) { - const string& name = entry.name(); + const std::string& name = entry.name(); if (!name.empty()) { nonempty_names->push_back(name); } @@ -90,7 +90,7 @@ void CollectNames(const T& entries, std::vector* nonempty_names, name_ptrs->reserve(entries.size() + 1); // +1 for nullptr array terminator size_t nonempty_index = 0; for (const auto& entry : entries) { - const string& name = entry.name(); + const std::string& name = entry.name(); if (!name.empty()) { name_ptrs->push_back(nonempty_names->at(nonempty_index).c_str()); ++nonempty_index; @@ -150,13 +150,18 @@ XlaJitCompiledCpuFunction::Compile( cpu_executable->buffer_assignment(); // Compute buffer infos and the result index, needed to run the raw function. - std::vector buffer_infos = - xla::cpu::CreateBufferInfosFromBufferAssignment(cpu_executable->module(), - buffer_assignment); - std::vector arg_index_table = - xla::cpu::CreateArgIndexTableFromBufferInfos(buffer_infos); - std::vector result_index_table = - xla::cpu::CreateResultIndexTableFromBufferInfos(buffer_infos); + std::vector buffer_infos = + xla::cpu::CreateBufferAllocationInfos(cpu_executable->module(), + buffer_assignment); + + std::vector buffer_allocation_infos = + xla::cpu::CreateBufferAllocationInfos(cpu_executable->module(), + buffer_assignment); + + std::vector arg_index_table = + xla::cpu::CreateArgIndexTable(buffer_infos); + std::vector result_index_table = + xla::cpu::CreateResultIndexTable(buffer_infos); TF_ASSIGN_OR_RETURN(size_t result_index, ComputeResultIndex(buffer_assignment)); const int num_results = CountResults(buffer_infos); diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h index 8d142ffbe3254f..6f61f472a2fd5a 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h @@ -22,10 +22,11 @@ limitations under the License. 
#include "absl/container/flat_hash_map.h" #include "absl/log/check.h" +#include "tensorflow/compiler/tf2xla/encoded_buffer_allocation_info.h" #include "tensorflow/compiler/tf2xla/tf2xla.pb.h" #include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.h" +#include "xla/backends/cpu/buffer_allocation_info.h" #include "xla/client/local_client.h" -#include "xla/cpu_function_runtime.h" #include "xla/service/cpu/executable.pb.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/platform/types.h" @@ -82,20 +83,20 @@ class XlaJitCompiledCpuFunction { XlaCompiledCpuFunction::StaticData static_data_; // The backing array for buffer infos. - std::vector buffer_infos_; + std::vector buffer_infos_; // The backing array for the arg index table. - std::vector arg_index_table_; + std::vector arg_index_table_; // The backing array for the result index table. - std::vector result_index_table_; + std::vector result_index_table_; // The backing arrays of arg and result names. We hold the actual strings in // nonempty_*_names_, and hold arrays of pointers in *_names_ for the static // data to refer to. - std::vector nonempty_arg_names_; - std::vector nonempty_variable_names_; - std::vector nonempty_result_names_; + std::vector nonempty_arg_names_; + std::vector nonempty_variable_names_; + std::vector nonempty_result_names_; std::vector arg_names_; std::vector variable_names_; std::vector result_names_; diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc index acac1efd73881f..b49e699d6e267f 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc @@ -182,18 +182,18 @@ TEST(XlaJitCompiledCpuFunction, Sum) { ASSERT_EQ(function.num_results(), 1); // Run the function and check results. 
- *static_cast(function.arg_data(0)) = 10; - *static_cast(function.arg_data(1)) = 32; + *static_cast(function.arg_data(0)) = 10; + *static_cast(function.arg_data(1)) = 32; EXPECT_TRUE(function.Run()); EXPECT_EQ(function.error_msg(), ""); - EXPECT_EQ(*static_cast(function.result_data(0)), 42); + EXPECT_EQ(*static_cast(function.result_data(0)), 42); // Run the function again. - *static_cast(function.arg_data(0)) = 100; - *static_cast(function.arg_data(1)) = 320; + *static_cast(function.arg_data(0)) = 100; + *static_cast(function.arg_data(1)) = 320; EXPECT_TRUE(function.Run()); EXPECT_EQ(function.error_msg(), ""); - EXPECT_EQ(*static_cast(function.result_data(0)), 420); + EXPECT_EQ(*static_cast(function.result_data(0)), 420); // Check name to index lookups. EXPECT_TRUE(function.HasNameIndices()); @@ -268,20 +268,20 @@ TEST(XlaJitCompiledCpuFunction, SumVariable) { ASSERT_EQ(function.num_results(), 2); // Run the function and check results. - *static_cast(function.arg_data(0)) = 10; - *static_cast(function.arg_data(1)) = 32; + *static_cast(function.arg_data(0)) = 10; + *static_cast(function.arg_data(1)) = 32; EXPECT_TRUE(function.Run()); EXPECT_EQ(function.error_msg(), ""); - EXPECT_EQ(*static_cast(function.result_data(0)), 10); - EXPECT_EQ(*static_cast(function.result_data(1)), 42); + EXPECT_EQ(*static_cast(function.result_data(0)), 10); + EXPECT_EQ(*static_cast(function.result_data(1)), 42); // Run the function again. - *static_cast(function.arg_data(0)) = 100; - *static_cast(function.arg_data(1)) = 320; + *static_cast(function.arg_data(0)) = 100; + *static_cast(function.arg_data(1)) = 320; EXPECT_TRUE(function.Run()); EXPECT_EQ(function.error_msg(), ""); - EXPECT_EQ(*static_cast(function.result_data(0)), 100); - EXPECT_EQ(*static_cast(function.result_data(1)), 420); + EXPECT_EQ(*static_cast(function.result_data(0)), 100); + EXPECT_EQ(*static_cast(function.result_data(1)), 420); // Check name to index lookups. 
EXPECT_TRUE(function.HasNameIndices()); @@ -325,7 +325,7 @@ TEST(XlaJitCompiledCpuFunction, CanCompileWithAdditionalPlatform) { int VisibleDeviceCount() const override { return 0; } - const string& Name() const override { return name_; } + const std::string& Name() const override { return name_; } absl::StatusOr> DescriptionForDevice( int ordinal) const override { @@ -338,7 +338,7 @@ TEST(XlaJitCompiledCpuFunction, CanCompileWithAdditionalPlatform) { } private: - string name_; + std::string name_; }; TF_EXPECT_OK( diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc index 4a570827029330..baefe0138d43dd 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc @@ -207,9 +207,9 @@ static absl::Status LiteralToInt64Scalar(const xla::LiteralSlice& literal, return errors::InvalidArgument("value is not a scalar"); } if (literal.shape().element_type() == xla::S16) { - *out = literal.Get({}); + *out = literal.Get({}); } else if (literal.shape().element_type() == xla::S32) { - *out = literal.Get({}); + *out = literal.Get({}); } else if (literal.shape().element_type() == xla::S64) { *out = literal.Get({}); } else { @@ -370,7 +370,7 @@ static absl::Status LiteralToInt64Vector(const xla::LiteralSlice& literal, int64_t size = xla::ShapeUtil::ElementsIn(literal.shape()); if (literal.shape().element_type() == xla::S32) { for (int64_t i = 0; i < size; ++i) { - out->push_back(literal.Get({i})); + out->push_back(literal.Get({i})); } } else if (literal.shape().element_type() == xla::S64) { for (int64_t i = 0; i < size; ++i) { @@ -422,7 +422,7 @@ absl::Status XlaOpKernelContext::ConstantInputAsInt64Literal( case xla::S32: { *out = xla::Literal( xla::ShapeUtil::ChangeElementType(literal.shape(), xla::S64)); - auto src_data = literal.data(); + auto src_data = literal.data(); for (int64_t i = 0; i < src_data.size(); ++i) { out->data()[i] = src_data[i]; } @@ -677,7 +677,7 @@ 
xla::PrimitiveType XlaOpKernelContext::output_xla_type(int index) { return type; } -void XlaOpKernelContext::SetOutput(int index, const xla::XlaOp& handle) { +void XlaOpKernelContext::SetOutput(int index, const xla::XlaOp handle) { SetOutputExpression( index, XlaExpression::XlaOp(handle, context_->expected_output_dtype(index))); @@ -688,7 +688,7 @@ void XlaOpKernelContext::SetConstantOutput(int index, const Tensor& constant) { } void XlaOpKernelContext::SetTensorListOutput(int index, - const xla::XlaOp& handle) { + const xla::XlaOp handle) { SetOutputExpression(index, XlaExpression::TensorList(handle)); } @@ -811,7 +811,7 @@ const xla::XlaComputation* XlaOpKernelContext::GetOrCreateMul( const Tensor& XlaOpKernelContext::GetInputTensorByName(absl::string_view name) { const Tensor* tensor; - CHECK(context_->input(name, &tensor).ok()); + CHECK_OK(context_->input(name, &tensor)); return *tensor; } diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h index b0830d0766acb2..30de5a796d03a1 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.h +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h @@ -249,7 +249,7 @@ class XlaOpKernelContext { // Sets output `index` to the XlaOp `handle`. // All outputs should be set using SetOutput and SetConstantOutput, not // via the underlying OpKernelContext. - void SetOutput(int index, const xla::XlaOp& handle); + void SetOutput(int index, xla::XlaOp handle); // Sets output `index` to compile-time constant `host_tensor`, where // `host_tensor` is a tensor in host memory. It is preferable to use @@ -260,7 +260,7 @@ class XlaOpKernelContext { void SetOutputExpression(int index, const XlaExpression& expression); // Sets output `index` to the Tensor List `handle`. - void SetTensorListOutput(int index, const xla::XlaOp& handle); + void SetTensorListOutput(int index, xla::XlaOp handle); // Status handling. 
void SetStatus(const absl::Status& status) { context_->SetStatus(status); } @@ -341,27 +341,27 @@ class XlaOpKernelContext { // Gets an XLA lambda to compute Max. This is cached in the // XlaContext since it may be used by multiple Ops. There is a // separate specialization of the computation for each DataType. - const xla::XlaComputation* GetOrCreateMax(const DataType type); + const xla::XlaComputation* GetOrCreateMax(DataType type); // Gets an XLA lambda to compute Min. This is cached in the // XlaContext since it may be used by multiple Ops. There is a // separate specialization of the computation for each DataType. - const xla::XlaComputation* GetOrCreateMin(const DataType type); + const xla::XlaComputation* GetOrCreateMin(DataType type); // Gets an XLA lambda to compute Add. This is cached in the // XlaContext since it may be used by multiple Ops. There is a // separate specialization of the computation for each DataType. - const xla::XlaComputation* GetOrCreateAdd(const DataType type); + const xla::XlaComputation* GetOrCreateAdd(DataType type); // Gets an XLA lambda to compute LogAddExp. This is cached in the // XlaContext since it may be used by multiple Ops. There is a // separate specialization of the computation for each DataType. - const xla::XlaComputation* GetOrCreateLogAddExp(const DataType type); + const xla::XlaComputation* GetOrCreateLogAddExp(DataType type); // Gets an XLA lambda to compute Mul. This is cached in the // XlaContext since it may be used by multiple Ops. There is a // separate specialization of the computation for each DataType. - const xla::XlaComputation* GetOrCreateMul(const DataType type); + const xla::XlaComputation* GetOrCreateMul(DataType type); // Returns stack trace encoded as a string at a given module, or an empty // string if none found. 
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc index 445065971f2a6a..c74db865769229 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.cc +++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc @@ -61,7 +61,7 @@ static absl::Status LaunchOpHasKernelForDevice(const DeviceType& device_type) { NodeDef node_def; node_def.set_name("_XlaLaunch-op"); node_def.set_op("XlaLaunch"); - string kernel_class_name; + std::string kernel_class_name; TF_RETURN_IF_ERROR(FindKernelDef(device_type, node_def, /*KernelDef*/ nullptr, &kernel_class_name)); VLOG(1) << "LaunchOpHasKernelForDevice" @@ -128,7 +128,7 @@ XlaOpRegistry::~XlaOpRegistry() = default; } /* static */ void XlaOpRegistry::RegisterCompilationDevice( - const string& device_name, const DeviceRegistration& registration) { + const std::string& device_name, const DeviceRegistration& registration) { XlaOpRegistry& registry = Instance(); mutex_lock lock(registry.mutex_); auto result = @@ -138,7 +138,7 @@ XlaOpRegistry::~XlaOpRegistry() = default; } /* static */ void XlaOpRegistry::RegisterBackend( - const string& compilation_device_name, + const std::string& compilation_device_name, absl::Span supported_types, BackendOpFilter op_filter) { XlaOpRegistry& registry = Instance(); mutex_lock lock(registry.mutex_); @@ -151,14 +151,14 @@ XlaOpRegistry::~XlaOpRegistry() = default; } /* static */ bool XlaOpRegistry::IsCompilationDevice( - const string& device_name) { + const std::string& device_name) { XlaOpRegistry& registry = Instance(); mutex_lock lock(registry.mutex_); return registry.backends_.find(device_name) != registry.backends_.end(); } /* static */ bool XlaOpRegistry::GetCompilationDevice( - const string& device_name, const DeviceRegistration** registration) { + const std::string& device_name, const DeviceRegistration** registration) { XlaOpRegistry& registry = Instance(); // Lazily register the CPU and GPU JIT devices the first time @@ -235,7 +235,7 @@ void 
XlaOpRegistry::RegisterCompilationKernels() { // 2. Process op registration without device allowlists: // this pass registers the kernels for all the other supported backends. for (auto& ops : registry.ops_) { - const string& op_name = ops.first; + const std::string& op_name = ops.first; std::vector>& op_registrations = ops.second; // Partition the op registration so that the ones with device allowlists // precede the one without device allowlist. @@ -247,7 +247,7 @@ void XlaOpRegistry::RegisterCompilationKernels() { // Collect a set of backend registered by ops with device allowlists. // The op registration without allowlists will register a generic kernel // for all other backends not in this set. - std::unordered_set allowlisted_backend; + std::unordered_set allowlisted_backend; for (auto& op_registration : op_registrations) { if (op_registration->has_device_allowlist) { allowlisted_backend.insert(op_registration->device_allowlist.begin(), @@ -267,7 +267,7 @@ void XlaOpRegistry::RegisterCompilationKernels() { } TF_CHECK_OK(lookup_status); - std::unordered_set type_attrs; + std::unordered_set type_attrs; for (const OpDef::AttrDef& attr_def : op_def->attr()) { if (attr_def.type() == "type" || attr_def.type() == "list(type)") { type_attrs.insert(attr_def.name()); @@ -309,7 +309,7 @@ void XlaOpRegistry::RegisterCompilationKernels() { // b) the types allowed by the OpDef, and // c) the type constraints. bool unsatisfiable_type_constraint = false; - for (const string& type_attr : type_attrs) { + for (const std::string& type_attr : type_attrs) { KernelDef::AttrConstraint* attr_constraint = kdef->add_constraint(); attr_constraint->set_name(type_attr); auto* allowed_values = @@ -375,7 +375,7 @@ void XlaOpRegistry::RegisterCompilationKernels() { } std::vector XlaOpRegistry::DeviceKernels( - const string& compilation_device_name, + const std::string& compilation_device_name, bool include_compilation_only_kernels) { // Ensure compilation kernels registered. 
RegisterCompilationKernels(); @@ -403,8 +403,8 @@ std::vector XlaOpRegistry::DeviceKernels( return kernels; } -/*static*/ std::vector XlaOpRegistry::GetAllRegisteredOps() { - std::vector ops; +/*static*/ std::vector XlaOpRegistry::GetAllRegisteredOps() { + std::vector ops; XlaOpRegistry& registry = Instance(); mutex_lock lock(registry.mutex_); ops.reserve(registry.ops_.size()); @@ -416,7 +416,7 @@ std::vector XlaOpRegistry::DeviceKernels( } /*static*/ const std::unordered_set* -XlaOpRegistry::CompileTimeConstantInputArgNames(const string& op) { +XlaOpRegistry::CompileTimeConstantInputArgNames(const std::string& op) { XlaOpRegistry& registry = Instance(); mutex_lock lock(registry.mutex_); auto it = registry.ops_.find(op); @@ -435,10 +435,10 @@ XlaOpRegistry::CompileTimeConstantInputArgNames(const string& op) { DCHECK(op_def != nullptr || op_kernel != nullptr); - std::unordered_set compile_time_constant_inputs_from_attr; - std::vector compile_time_constant_inputs_vect_from_attr; + std::unordered_set compile_time_constant_inputs_from_attr; + std::vector compile_time_constant_inputs_vect_from_attr; - const std::unordered_set* compile_time_constant_inputs; + const std::unordered_set* compile_time_constant_inputs; if (TryGetNodeAttr(node_def, kXlaCompileTimeConstantInputsAttr, &compile_time_constant_inputs_vect_from_attr)) { @@ -459,7 +459,7 @@ XlaOpRegistry::CompileTimeConstantInputArgNames(const string& op) { << " required constants are: " << absl::StrJoin(*compile_time_constant_inputs, ", "); - for (const string& input : *compile_time_constant_inputs) { + for (const std::string& input : *compile_time_constant_inputs) { if (op_def) { NameRangeMap input_name_ranges; TF_RETURN_IF_ERROR( @@ -486,7 +486,7 @@ XlaOpRegistry::CompileTimeConstantInputArgNames(const string& op) { return absl::OkStatus(); } -/*static*/ bool XlaOpRegistry::IsMetadataOp(const string& op) { +/*static*/ bool XlaOpRegistry::IsMetadataOp(const std::string& op) { XlaOpRegistry& registry = Instance(); 
mutex_lock lock(registry.mutex_); auto it = registry.ops_.find(op); @@ -500,8 +500,8 @@ XlaOpRegistry::CompileTimeConstantInputArgNames(const string& op) { return it->second.front()->is_metadata_op; } -std::vector XlaOpRegistry::BackendNames() { - std::vector names; +std::vector XlaOpRegistry::BackendNames() { + std::vector names; XlaOpRegistry& registry = Instance(); mutex_lock lock(registry.mutex_); names.reserve(registry.backends_.size()); @@ -511,7 +511,7 @@ std::vector XlaOpRegistry::BackendNames() { return names; } -bool XlaOpRegistry::IsBackendRegistered(const string& name) { +bool XlaOpRegistry::IsBackendRegistered(const std::string& name) { XlaOpRegistry& registry = Instance(); mutex_lock lock(registry.mutex_); return registry.backends_.find(name) != registry.backends_.end(); @@ -524,7 +524,7 @@ XlaOpRegistry& XlaOpRegistry::Instance() { XlaOpRegistrationBuilder::XlaOpRegistrationBuilder(absl::string_view name) { registration_.reset(new XlaOpRegistry::OpRegistration); - registration_->name = string(name); + registration_->name = std::string(name); } XlaOpRegistrationBuilder XlaOpRegistrationBuilder::Name( @@ -572,7 +572,7 @@ XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::AllowStringType() { XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::TypeConstraint( absl::string_view attr_name, DataType allowed) { std::set& types = - registration_->type_constraints[string(attr_name)]; + registration_->type_constraints[std::string(attr_name)]; types.insert(allowed); return *this; } @@ -580,7 +580,7 @@ XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::TypeConstraint( XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::TypeConstraint( absl::string_view attr_name, absl::Span allowed) { std::set& types = - registration_->type_constraints[string(attr_name)]; + registration_->type_constraints[std::string(attr_name)]; for (DataType t : allowed) { types.insert(t); } @@ -628,7 +628,7 @@ XlaBackendRegistrar::XlaBackendRegistrar( absl::string_view name, absl::Span 
types, XlaOpRegistry::BackendOpFilter op_filter) { XlaOpRegistry& registry = XlaOpRegistry::Instance(); - registry.RegisterBackend(string(name), types, op_filter); + registry.RegisterBackend(std::string(name), types, op_filter); AddSymbolicExecutionDevice(name); } diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h index 5eaf0fb2d42bfa..9ce6e263f8feb4 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.h +++ b/tensorflow/compiler/tf2xla/xla_op_registry.h @@ -139,7 +139,7 @@ class XlaOpRegistry { // Describes how to compile operators assigned to a device. struct DeviceRegistration { // The name of the an XLA compilation device to use to compile code. - string compilation_device_name; + std::string compilation_device_name; // When should we autocluster operators assigned to this device? AutoclusteringPolicy autoclustering_policy; @@ -190,25 +190,25 @@ class XlaOpRegistry { // `backend_op_filter` should return true if the op should be registered on // the device; it may optionally modify the KernelDef. typedef bool (*BackendOpFilter)(KernelDef* kdef); - static void RegisterBackend(const string& compilation_device_name, + static void RegisterBackend(const std::string& compilation_device_name, absl::Span supported_types, BackendOpFilter op_filter); // Returns the names of the registered backends. - static std::vector BackendNames(); + static std::vector BackendNames(); // Returns true iff a backend with the given name is registered. - static bool IsBackendRegistered(const string& name); + static bool IsBackendRegistered(const std::string& name); // Registers `device_name` for XLA compilation, using information from // `registration`. // Does nothing if a registration for `device_name` already exists. 
- static void RegisterCompilationDevice(const string& device_name, + static void RegisterCompilationDevice(const std::string& device_name, const DeviceRegistration& registration); // Returns whether the device name is for the JIT device used exclusively for // TF2XLA conversion. - static bool IsCompilationDevice(const string& device_name); + static bool IsCompilationDevice(const std::string& device_name); // Returns the JIT device name associated with 'device_name', setting // 'jit_device_name', 'requires_jit', and 'enabled_jit_by_default', if they @@ -216,7 +216,7 @@ class XlaOpRegistry { // JIT device is registered. // '*enable_jit_by_default' is set to true if we should try to JIT using this // device when the JIT is enabled via the Session OptimizerOptions. - static bool GetCompilationDevice(const string& device_name, + static bool GetCompilationDevice(const std::string& device_name, const DeviceRegistration** registration); // Registers all JIT kernels on JIT devices, if not already registered. @@ -227,11 +227,11 @@ class XlaOpRegistry { // 'compilation_device_name'. Does not include kernels registered as // CompilationOnly, iff include_compilation_only_kernels=false. static std::vector DeviceKernels( - const string& compilation_device_name, + const std::string& compilation_device_name, bool include_compilation_only_kernels); // Returns all operations for which there are XLA kernels on any device. - static std::vector GetAllRegisteredOps(); + static std::vector GetAllRegisteredOps(); // Returns (via `result`) the indices of inputs to `node_def` that must be // compile-time constants. Returns an empty vector if the op is not @@ -265,11 +265,11 @@ class XlaOpRegistry { // Return names of arguments for a given op which are supposed to be // constants. 
static const std::unordered_set* - CompileTimeConstantInputArgNames(const string& op); + CompileTimeConstantInputArgNames(const std::string& op); // Returns true if `op` is a "metadata" op, one that only looks at the shapes // of its operands and not their values. - static bool IsMetadataOp(const string& op); + static bool IsMetadataOp(const std::string& op); private: friend class XlaBackendRegistrar; @@ -298,15 +298,15 @@ class XlaOpRegistry { }; // Map from compilation device names to a description of the backend. - std::unordered_map backends_ TF_GUARDED_BY(mutex_); + std::unordered_map backends_ TF_GUARDED_BY(mutex_); // Map from Tensorflow device names to the corresponding JIT device metadata. - std::unordered_map compilation_devices_ + std::unordered_map compilation_devices_ TF_GUARDED_BY(mutex_); // A description of a Tensorflow operator that can be compiled to XLA. struct OpRegistration { - string name; + std::string name; // Should this operator be registered only on compilation devices, without a // dummy kernel registered on the corresponding XLA device? @@ -325,15 +325,15 @@ class XlaOpRegistry { bool allow_string_type = false; // Mapping from attribute name to a list of supported types. - std::unordered_map> type_constraints; + std::unordered_map> type_constraints; // An optional allowlist of devices. If there is no allowlist, all devices // are permitted. bool has_device_allowlist = false; - std::unordered_set device_allowlist; + std::unordered_set device_allowlist; // Names of arguments that must be compile-time constants. - std::unordered_set compile_time_constant_inputs; + std::unordered_set compile_time_constant_inputs; // True if this is a "metadata" op, one that only looks at the shapes of its // operands and not their values. @@ -360,8 +360,8 @@ class XlaOpRegistry { // Map from operator name to OpRegistrations, populated by REGISTER_XLA_OP. 
// Registrations present under the same key must satisfy IsCompatible above, // and this is checked during registration. - std::unordered_map>> ops_ - TF_GUARDED_BY(mutex_); + std::unordered_map>> + ops_ TF_GUARDED_BY(mutex_); // Have we already registered the JIT kernels on the JIT devices? bool jit_kernels_registered_ = false; diff --git a/tensorflow/compiler/tf2xla/xla_resource.cc b/tensorflow/compiler/tf2xla/xla_resource.cc index 5b894d07e121ba..962b0e473a826c 100644 --- a/tensorflow/compiler/tf2xla/xla_resource.cc +++ b/tensorflow/compiler/tf2xla/xla_resource.cc @@ -51,29 +51,29 @@ namespace tensorflow { } /*static*/ std::unique_ptr XlaResource::CreateStack( - string name, DataType type, int64_t max_size) { + std::string name, DataType type, int64_t max_size) { return std::make_unique( XlaResource::kStack, /*arg_num=*/-1, std::move(name), type, TensorShape(), /*initial_value=*/xla::XlaOp(), /*max_array_size=*/max_size, - /*tensor_array_gradients=*/std::set{}, + /*tensor_array_gradients=*/std::set{}, /*tensor_array_multiple_writes_aggregate=*/false); } /*static*/ std::unique_ptr XlaResource::CreateTensorArray( - string name, DataType type, TensorShape shape, xla::XlaOp initial_value, - int64_t max_array_size) { + std::string name, DataType type, TensorShape shape, + xla::XlaOp initial_value, int64_t max_array_size) { return std::make_unique( XlaResource::kTensorArray, /*arg_num=*/-1, std::move(name), type, shape, initial_value, max_array_size, - /*tensor_array_gradients=*/std::set{}, + /*tensor_array_gradients=*/std::set{}, /*tensor_array_multiple_writes_aggregate=*/false); } XlaResource::XlaResource( - Kind kind, int arg_num, string name, DataType type, TensorShape shape, + Kind kind, int arg_num, std::string name, DataType type, TensorShape shape, xla::XlaOp initial_value, int64_t max_array_size, - const std::set& tensor_array_gradients, + const std::set& tensor_array_gradients, bool tensor_array_multiple_writes_aggregate, const std::optional& 
definition_stack_trace) : kind_(kind), @@ -89,7 +89,7 @@ XlaResource::XlaResource( definition_stack_trace_(definition_stack_trace) { CHECK(kind_ != kInvalid); - for (const string& gradient : tensor_array_gradients) { + for (const std::string& gradient : tensor_array_gradients) { tensor_array_gradients_[gradient].reset(new XlaResource( /*kind=*/kTensorArray, /*arg_num=*/-1, /*name=*/absl::StrCat("TensorArrayGrad: ", name_), type_, shape_, @@ -163,7 +163,7 @@ absl::Status XlaResource::SetZeroValue(xla::XlaBuilder* builder) { value_ = xla::Tuple(builder, {xla::Broadcast(XlaHelpers::Zero(builder, type_), ta_shape.dim_sizes()), - xla::ConstantR0(builder, 0)}); + xla::ConstantR0(builder, 0)}); break; } @@ -175,7 +175,7 @@ absl::Status XlaResource::SetZeroValue(xla::XlaBuilder* builder) { } absl::Status XlaResource::GetOrCreateTensorArrayGradient( - const string& source, xla::XlaBuilder* builder, + const std::string& source, xla::XlaBuilder* builder, XlaResource** gradient_out) { VLOG(2) << "Gradient lookup for resource: " << name_ << " gradient: " << source; @@ -214,9 +214,9 @@ absl::Status XlaResource::Pack(xla::XlaOp* pack, return absl::OkStatus(); } -absl::Status XlaResource::SetFromPack(const std::set& gradient_sources, - const xla::XlaOp pack, - xla::XlaBuilder* builder) { +absl::Status XlaResource::SetFromPack( + const std::set& gradient_sources, const xla::XlaOp pack, + xla::XlaBuilder* builder) { if (gradient_sources.empty()) { if (!initialized()) { initial_value_ = pack; diff --git a/tensorflow/compiler/tf2xla/xla_resource.h b/tensorflow/compiler/tf2xla/xla_resource.h index d4c8f7c1c9347f..07c826d21e8b3d 100644 --- a/tensorflow/compiler/tf2xla/xla_resource.h +++ b/tensorflow/compiler/tf2xla/xla_resource.h @@ -43,18 +43,19 @@ class XlaResource { static absl::string_view KindToString(Kind kind); // Creates a new Stack resource. 
- static std::unique_ptr CreateStack(string name, DataType type, + static std::unique_ptr CreateStack(std::string name, + DataType type, int64_t max_size); // Creates a new TensorArray resource. static std::unique_ptr CreateTensorArray( - string name, DataType type, TensorShape shape, xla::XlaOp initial_value, - int64_t max_array_size); + std::string name, DataType type, TensorShape shape, + xla::XlaOp initial_value, int64_t max_array_size); - XlaResource(Kind kind, int arg_num, string name, DataType type, + XlaResource(Kind kind, int arg_num, std::string name, DataType type, TensorShape shape, xla::XlaOp initial_value, int64_t max_array_size, - const std::set& tensor_array_gradients, + const std::set& tensor_array_gradients, bool tensor_array_multiple_writes_aggregate, const std::optional& definition_stack_trace = std::nullopt); @@ -72,7 +73,7 @@ class XlaResource { int arg_num() const { return arg_num_; } // A descriptive name for the resource, used in error messages. - const string& name() const { return name_; } + const std::string& name() const { return name_; } // Current type and value of the resource. Uninitialized resources are // represented by a default (zero) handle and type DT_INVALID. @@ -121,7 +122,7 @@ class XlaResource { // exist. The call target must be an initialized TensorArray resource. A // TensorArray can have multiple named gradients; see the operator // documentation for TensorArrayGradV3 for details. - absl::Status GetOrCreateTensorArrayGradient(const string& source, + absl::Status GetOrCreateTensorArrayGradient(const std::string& source, xla::XlaBuilder* builder, XlaResource** gradient_out); @@ -138,7 +139,7 @@ class XlaResource { // If `reset_initial_values` is true, sets the initial_values as well as the // values. // Opposite of Pack(). 
- absl::Status SetFromPack(const std::set& gradient_sources, + absl::Status SetFromPack(const std::set& gradient_sources, xla::XlaOp pack, xla::XlaBuilder* builder); bool IsOverwritten() { return is_overwritten_; } @@ -164,15 +165,15 @@ class XlaResource { // string, irrespective of the number of calls to TensorArrayGrad. The map // is ordered since values are packed into tuples by Pack() sorted by name // order. - const std::map>& tensor_array_gradients() - const { + const std::map>& + tensor_array_gradients() const { return tensor_array_gradients_; } private: const Kind kind_; const int arg_num_; - const string name_; + const std::string name_; DataType type_; TensorShape shape_; @@ -186,7 +187,7 @@ class XlaResource { int64_t max_array_size_ = -1; bool tensor_array_multiple_writes_aggregate_ = false; - std::map> tensor_array_gradients_; + std::map> tensor_array_gradients_; bool is_overwritten_ = false; std::optional definition_stack_trace_; diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 29aea47709cd6d..dd8da3665c5294 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -493,7 +493,6 @@ cc_library( "@local_tsl//tsl/platform:framework_lite_hdrs", "@local_xla//xla/tsl/framework:numeric_types.h", "@local_xla//xla/tsl/framework:type_traits.h", - "@local_xla//xla/tsl/platform/default:integral_types.h", ], visibility = ["//visibility:public"], deps = [ @@ -1537,7 +1536,6 @@ cc_library( hdrs = [ "//tensorflow/core/platform:tflite_portable_logging_hdrs", "@local_tsl//tsl/platform:tflite_portable_logging_hdrs", - "@local_xla//xla/tsl/platform/default:integral_types.h", ], compatible_with = get_compatible_with_portable(), copts = tf_copts(), @@ -1938,21 +1936,15 @@ tf_cc_tests( ) tf_cc_tests( - name = "cell_reader_test", + name = "test_utils_test", size = "small", srcs = [ - "//tensorflow/core/lib/monitoring:cell_reader_test.cc", "//tensorflow/core/lib/monitoring:test_utils_test.cc", ], deps = [ ":protos_all_cc", ":test", ":test_main", - 
"//tensorflow/core/lib/monitoring:cell_reader", - "//tensorflow/core/lib/monitoring:counter", - "//tensorflow/core/lib/monitoring:gauge", - "//tensorflow/core/lib/monitoring:percentile_sampler", - "//tensorflow/core/lib/monitoring:sampler", "//tensorflow/core/lib/monitoring:test_utils", "//tensorflow/core/lib/monitoring:types", "//tensorflow/core/platform:errors", diff --git a/tensorflow/core/activity_watcher/activity.h b/tensorflow/core/activity_watcher/activity.h index eecd207a33fe27..fba51b43f8a3ce 100644 --- a/tensorflow/core/activity_watcher/activity.h +++ b/tensorflow/core/activity_watcher/activity.h @@ -32,7 +32,7 @@ namespace tensorflow { namespace activity_watcher { -using ActivityId = tsl::uint64; +using ActivityId = uint64_t; constexpr ActivityId kActivityNotRecorded = 0; constexpr int kWatcherDisabled = 0; @@ -45,7 +45,7 @@ enum ActivityCategory { kRendezvous = 5, }; -static tsl::string ToString(ActivityCategory category) { +static std::string ToString(ActivityCategory category) { switch (category) { case ActivityCategory::kCollective: return "Collective"; @@ -64,17 +64,17 @@ static tsl::string ToString(ActivityCategory category) { // An activity to be recorded. struct Activity { - using Attributes = absl::flat_hash_map; + using Attributes = absl::flat_hash_map; // A human readable title of the activity. - tsl::string title; + std::string title; // The category of the activity. ActivityCategory category = ActivityCategory::kMisc; // Key/value pairs that are attached to the activity. 
Attributes attributes; Activity() = default; - Activity(tsl::string title, ActivityCategory category) + Activity(std::string title, ActivityCategory category) : title(std::move(title)), category(category) {} - Activity(tsl::string title, ActivityCategory category, Attributes attributes) + Activity(std::string title, ActivityCategory category, Attributes attributes) : title(std::move(title)), category(category), attributes(std::move(attributes)) {} diff --git a/tensorflow/core/activity_watcher/activity_utils.cc b/tensorflow/core/activity_watcher/activity_utils.cc index b3631076c5c2d9..58b3909a25789c 100644 --- a/tensorflow/core/activity_watcher/activity_utils.cc +++ b/tensorflow/core/activity_watcher/activity_utils.cc @@ -28,7 +28,7 @@ namespace tensorflow { namespace activity_watcher { std::unique_ptr ActivityFromContext( - OpKernelContext* context, tsl::string name, ActivityCategory category, + OpKernelContext* context, std::string name, ActivityCategory category, Activity::Attributes additional_attributes) { Activity::Attributes attributes(std::move(additional_attributes)); if (context) { diff --git a/tensorflow/core/activity_watcher/activity_utils.h b/tensorflow/core/activity_watcher/activity_utils.h index 64958cd5e09744..749ef1326ae565 100644 --- a/tensorflow/core/activity_watcher/activity_utils.h +++ b/tensorflow/core/activity_watcher/activity_utils.h @@ -29,7 +29,7 @@ namespace activity_watcher { // A convenient way to create an activity. Writes OpKernelContext information // and given attributes to a new activity and returns. 
std::unique_ptr ActivityFromContext( - OpKernelContext* context, tsl::string name, ActivityCategory category, + OpKernelContext* context, std::string name, ActivityCategory category, Activity::Attributes additional_attributes = Activity::Attributes()); } // namespace activity_watcher diff --git a/tensorflow/core/api_def/BUILD b/tensorflow/core/api_def/BUILD index 76b8cc01324619..caf20c11b93566 100644 --- a/tensorflow/core/api_def/BUILD +++ b/tensorflow/core/api_def/BUILD @@ -65,6 +65,7 @@ cc_library( "//tensorflow/core:op_gen_lib", "//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/strings:str_format", ], ) diff --git a/tensorflow/core/api_def/api_test.cc b/tensorflow/core/api_def/api_test.cc index 7f844e88ba90c6..3c954cf076ddc8 100644 --- a/tensorflow/core/api_def/api_test.cc +++ b/tensorflow/core/api_def/api_test.cc @@ -43,26 +43,27 @@ namespace { constexpr char kApiDefFilePattern[] = "api_def_*.pbtxt"; -string DefaultApiDefDir() { +std::string DefaultApiDefDir() { return GetDataDependencyFilepath( io::JoinPath("tensorflow", "core", "api_def", "base_api")); } -string PythonApiDefDir() { +std::string PythonApiDefDir() { return GetDataDependencyFilepath( io::JoinPath("tensorflow", "core", "api_def", "python_api")); } // Reads golden ApiDef files and returns a map from file name to ApiDef file // contents. 
-void GetGoldenApiDefs(Env* env, const string& api_files_dir, - std::unordered_map* name_to_api_def) { - std::vector matching_paths; +void GetGoldenApiDefs( + Env* env, const std::string& api_files_dir, + std::unordered_map* name_to_api_def) { + std::vector matching_paths; TF_CHECK_OK(env->GetMatchingPaths( io::JoinPath(api_files_dir, kApiDefFilePattern), &matching_paths)); for (auto& file_path : matching_paths) { - string file_contents; + std::string file_contents; TF_CHECK_OK(ReadFileToString(env, file_path, &file_contents)); file_contents = PBTxtFromMultiline(file_contents); @@ -76,8 +77,9 @@ void GetGoldenApiDefs(Env* env, const string& api_files_dir, } void TestAllApiDefsHaveCorrespondingOp( - const OpList& ops, const std::unordered_map& api_defs_map) { - std::unordered_set op_names; + const OpList& ops, + const std::unordered_map& api_defs_map) { + std::unordered_set op_names; for (const auto& op : ops.op()) { op_names.insert(op.name()); } @@ -89,7 +91,8 @@ void TestAllApiDefsHaveCorrespondingOp( } void TestAllApiDefInputArgsAreValid( - const OpList& ops, const std::unordered_map& api_defs_map) { + const OpList& ops, + const std::unordered_map& api_defs_map) { for (const auto& op : ops.op()) { const auto api_def_iter = api_defs_map.find(op.name()); if (api_def_iter == api_defs_map.end()) { @@ -113,7 +116,8 @@ void TestAllApiDefInputArgsAreValid( } void TestAllApiDefOutputArgsAreValid( - const OpList& ops, const std::unordered_map& api_defs_map) { + const OpList& ops, + const std::unordered_map& api_defs_map) { for (const auto& op : ops.op()) { const auto api_def_iter = api_defs_map.find(op.name()); if (api_def_iter == api_defs_map.end()) { @@ -137,7 +141,8 @@ void TestAllApiDefOutputArgsAreValid( } void TestAllApiDefAttributeNamesAreValid( - const OpList& ops, const std::unordered_map& api_defs_map) { + const OpList& ops, + const std::unordered_map& api_defs_map) { for (const auto& op : ops.op()) { const auto api_def_iter = api_defs_map.find(op.name()); if 
(api_def_iter == api_defs_map.end()) { @@ -159,7 +164,7 @@ void TestAllApiDefAttributeNamesAreValid( } void TestDeprecatedAttributesSetCorrectly( - const std::unordered_map& api_defs_map) { + const std::unordered_map& api_defs_map) { for (const auto& name_and_api_def : api_defs_map) { int num_deprecated_endpoints = 0; const auto& api_def = name_and_api_def.second; @@ -186,7 +191,7 @@ void TestDeprecatedAttributesSetCorrectly( } void TestDeprecationVersionSetCorrectly( - const std::unordered_map& api_defs_map) { + const std::unordered_map& api_defs_map) { for (const auto& name_and_api_def : api_defs_map) { const auto& name = name_and_api_def.first; const auto& api_def = name_and_api_def.second; @@ -205,13 +210,13 @@ class BaseApiTest : public ::testing::Test { protected: BaseApiTest() { OpRegistry::Global()->Export(false, &ops_); - const std::vector multi_line_fields = {"description"}; + const std::vector multi_line_fields = {"description"}; Env* env = Env::Default(); GetGoldenApiDefs(env, DefaultApiDefDir(), &api_defs_map_); } OpList ops_; - std::unordered_map api_defs_map_; + std::unordered_map api_defs_map_; }; // Check that all ops have an ApiDef. @@ -233,7 +238,7 @@ TEST_F(BaseApiTest, AllApiDefsHaveCorrespondingOp) { TestAllApiDefsHaveCorrespondingOp(ops_, api_defs_map_); } -string GetOpDefHasDocStringError(const string& op_name) { +std::string GetOpDefHasDocStringError(const std::string& op_name) { return strings::Printf( "OpDef for %s has a doc string. " "Doc strings must be defined in ApiDef instead of OpDef. 
" @@ -301,13 +306,13 @@ class PythonApiTest : public ::testing::Test { protected: PythonApiTest() { OpRegistry::Global()->Export(false, &ops_); - const std::vector multi_line_fields = {"description"}; + const std::vector multi_line_fields = {"description"}; Env* env = Env::Default(); GetGoldenApiDefs(env, PythonApiDefDir(), &api_defs_map_); } OpList ops_; - std::unordered_map api_defs_map_; + std::unordered_map api_defs_map_; }; // Check that ApiDefs have a corresponding op. diff --git a/tensorflow/core/api_def/base_api/api_def_ComplexAbs.pbtxt b/tensorflow/core/api_def/base_api/api_def_ComplexAbs.pbtxt index 7c4db1f721a032..41868ddc6c649f 100644 --- a/tensorflow/core/api_def/base_api/api_def_ComplexAbs.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ComplexAbs.pbtxt @@ -1,5 +1,12 @@ op { graph_op_name: "ComplexAbs" + attr { + name: "Tout" + description: <